diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 76f6d7aeca0d..77ee313687fc 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@
import sys
import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
new file mode 100644
index 000000000000..56ec933c9cc0
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+ metrics:
+ - name: "exact_match,strict-match"
+ value: 0.419
+ - name: "exact_match,flexible-extract"
+ value: 0.416
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
new file mode 100644
index 000000000000..ccb4f84201b7
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+ metrics:
+ - name: "relaxed_accuracy,none"
+ # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+ value: 0.80
+limit: 100
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
new file mode 100644
index 000000000000..46f1a9fbf6ff
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1,10 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+ metrics:
+ - name: "exact_match,custom-extract"
+ value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
index a2f235f48581..aa4fb9fa03d6 100644
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
new file mode 100644
index 000000000000..5f3c31743e75
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+ metrics:
+ - name: "relaxed_accuracy,none"
+ value: 0.855
+limit: 2500
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
new file mode 100644
index 000000000000..4fb0b84bc4d8
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
new file mode 100644
index 000000000000..91e22b6459c1
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
new file mode 100644
index 000000000000..1097d220245f
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
\ No newline at end of file
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
new file mode 100755
index 000000000000..c8db951381b0
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install lm-eval==0.4.9
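+#
+# Example invocation (taken from the ChartQA model config in configs/):
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1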
+
+usage() {
+ echo``
+ echo "Runs lm eval harness on ChartQA using multimodal vllm."
+ echo "This pathway is intended to be used to create baselines for "
+ echo "our correctness tests in vllm's CI."
+ echo
+ echo "usage: ${0} "
+ echo
+ echo " -m - huggingface stub or local directory of the model"
+ echo " -l - limit number of samples to run"
+ echo " -t - tensor parallel size to run at"
+ echo
+}
+
+while getopts "m:l:t:" OPT; do
+ case ${OPT} in
+ m )
+ MODEL="$OPTARG"
+ ;;
+ l )
+ LIMIT="$OPTARG"
+ ;;
+ t )
+ TP_SIZE="$OPTARG"
+ ;;
+ \? )
+ usage
+ exit 1
+ ;;
+ esac
+done
+
+lm_eval --model vllm-vlm \
+ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+ --tasks chartqa \
+ --batch_size auto \
+ --apply_chat_template \
+ --limit $LIMIT
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
old mode 100644
new mode 100755
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
new file mode 100644
index 000000000000..d85a1721db9a
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
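+#
+# Example invocation (taken from the Llama-4-Maverick model config in configs/):
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5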
+
+usage() {
+ echo``
+ echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+ echo "This pathway is intended to be used to create baselines for "
+ echo "our automated nm-test-accuracy workflow"
+ echo
+ echo "usage: ${0} "
+ echo
+ echo " -m - huggingface stub or local directory of the model"
+ echo " -l - limit number of samples to run"
+ echo " -f - number of fewshot samples to use"
+ echo " -t - tensor parallel size to run at"
+ echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+ case ${OPT} in
+ m )
+ MODEL="$OPTARG"
+ ;;
+ b )
+ BATCH_SIZE="$OPTARG"
+ ;;
+ l )
+ LIMIT="$OPTARG"
+ ;;
+ f )
+ FEWSHOT="$OPTARG"
+ ;;
+ t )
+ TP_SIZE="$OPTARG"
+ ;;
+ \? )
+ usage
+ exit 1
+ ;;
+ esac
+done
+
+lm_eval --model vllm \
+ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+ --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+ --batch_size auto
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index ceea01166b7f..f10de82b1d8e 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -19,21 +19,27 @@
def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
+ batch_size = eval_config.get("batch_size", "auto")
+ backend = eval_config.get("backend", "vllm")
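+ # "backend" is read from the model config: multimodal configs (e.g. the ChartQA
+ # ones) set it to "vllm-vlm", while text configs default to "vllm".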
model_args = (
f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size},"
f"enforce_eager=true,"
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code},"
- f"max_model_len={max_model_len}"
+ f"max_model_len={max_model_len},"
)
results = lm_eval.simple_evaluate(
- model="vllm",
+ model=backend,
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
- batch_size="auto",
+ # TODO(yeq): using a chat template w/ fewshot_as_multiturn is supposed to help
+ # text models. However, it regresses the measured strict-match scores for
+ # existing text models in CI, so only apply it for multimodal models.
+ apply_chat_template=backend == "vllm-vlm",
+ batch_size=batch_size,
)
return results
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 37e2980eea97..2ef36089b6af 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -8,7 +8,7 @@ This benchmark aims to:
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
## Setup
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 77047636bb95..a655a650cb32 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -368,7 +368,7 @@ def parse_client_command(cmd: str) -> dict[str, Any]:
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
- lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+ lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
)
# get markdown tables
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
index fb5063db8694..ebacdcbd6821 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@@ -181,18 +181,14 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
- server_command="python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="vllm serve $model \
-tp $tp \
- --model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
- server_command="python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="vllm serve $model \
-tp $tp \
- --model $model \
--port $port \
$server_args"
fi
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index b1b7d2d77a44..c64e5638029e 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -365,8 +365,7 @@ run_serving_tests() {
continue
fi
- server_command="$server_envs python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="$server_envs vllm serve \
$server_args"
# run the server
@@ -455,11 +454,6 @@ main() {
fi
check_hf_token
- # Set to v1 to run v1 benchmark
- if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
- export VLLM_USE_V1=1
- fi
-
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml
deleted file mode 100644
index d5cad1c73c6f..000000000000
--- a/.buildkite/pyproject.toml
+++ /dev/null
@@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
- # pycodestyle
- "E",
- # Pyflakes
- "F",
- # pyupgrade
- "UP",
- # flake8-bugbear
- "B",
- # flake8-simplify
- "SIM",
- # isort
- "I",
- # flake8-logging-format
- "G",
-]
-ignore = [
- # star imports
- "F405", "F403",
- # lambda expression assignment
- "E731",
- # Loop control variable not used within loop body
- "B007",
- # f-string format
- "UP032",
- # Can remove once 3.10+ is the minimum Python version
- "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index a1de41652c9a..5bc59c151565 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,24 +1,36 @@
steps:
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build arm64 wheel - CUDA 12.9"
+ depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- - block: "Build CUDA 12.8 wheel"
- key: block-build-cu128-wheel
+ # aarch64 build.
+ - label: "Build arm64 CPU wheel"
+ depends_on: ~
+ id: build-wheel-arm64-cpu
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh"
+ env:
+ DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.8"
- depends_on: block-build-cu128-wheel
+ depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
@@ -30,12 +42,8 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build CUDA 12.6 wheel"
- key: block-build-cu126-wheel
- depends_on: ~
-
- label: "Build wheel - CUDA 12.6"
- depends_on: block-build-cu126-wheel
+ depends_on: ~
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
@@ -54,7 +62,7 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -82,7 +90,7 @@ steps:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
@@ -102,8 +110,6 @@ steps:
depends_on:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
- - build-wheel-cuda-12-6
- - build-wheel-cuda-12-9
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
@@ -150,6 +156,22 @@ steps:
env:
DOCKER_BUILDKIT: "1"
+ - block: "Build arm64 CPU release image"
+ key: block-arm64-cpu-release-image-build
+ depends_on: ~
+
+ - label: "Build and publish arm64 CPU release image"
+ depends_on: block-arm64-cpu-release-image-build
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+ env:
+ DOCKER_BUILDKIT: "1"
+
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
@@ -158,11 +180,16 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
- - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- - "docker push vllm/vllm-openai:nightly"
- - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+ - "docker push vllm/vllm-openai:nightly-x86_64"
+ - "docker push vllm/vllm-openai:nightly-aarch64"
+ - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+ - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+ - "docker manifest push vllm/vllm-openai:nightly"
+ - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
@@ -171,3 +198,4 @@ steps:
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
+ DOCKERHUB_USERNAME: "vllmbot"
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index 94e0ac2398f3..fde48603ad3c 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
\ No newline at end of file
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
index 1a82f7d08523..f02a128c6772 100755
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -8,20 +8,41 @@ set -ex
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
-# Get DockerHub token from environment
+# Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
exit 1
fi
+if [ -z "$DOCKERHUB_USERNAME" ]; then
+ echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+ exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+ -H "Content-Type: application/json" \
+ -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+ "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+ echo "Error: Failed to get DockerHub bearer token"
+ exit 1
+fi
+
# Function to get all tags from DockerHub
get_all_tags() {
local page=1
local all_tags=""
while true; do
- local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+ set +x
+ local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
"$REPO_API_URL?page=$page&page_size=100")
+ set -x
# Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@@ -43,7 +64,9 @@ delete_tag() {
echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
- local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+ set +x
+ local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+ set -x
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index c395011a2448..b2309d5ddea2 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -10,7 +10,7 @@ export PYTHONPATH=".."
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
- if grep -q clean /opt/amdgpu/etc/gpu_state; then
+ if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then
echo "GPUs state is \"clean\""
break
fi
@@ -49,18 +49,18 @@ cleanup_docker
echo "--- Resetting GPUs"
-echo "reset" > /opt/amdgpu/etc/gpu_state
+echo "reset" > ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}
while true; do
sleep 3
- if grep -q clean /opt/amdgpu/etc/gpu_state; then
+ if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
-image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
- commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
@@ -167,12 +163,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 36bcb015d308..39ea18017308 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,25 +25,28 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
- set -e
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ set -xve
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
- set -e
+ set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
- pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+
+ # Note: disable Bart until it supports V1
+ # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
- pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+ # TODO: The test case below, tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base], fails on ppc64le. Disabling it for the time being.
+ # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 0f734763f13f..7927aef19e4e 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -58,15 +58,11 @@ function cpu_tests() {
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
- # Note: disable Bart until supports V1
- pytest -x -v -s tests/models/language/generation -m cpu_model \
- --ignore=tests/models/language/generation/test_bart.py
- VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
- --ignore=tests/models/language/generation/test_bart.py
+ pytest -x -v -s tests/models/language/generation -m cpu_model
+ VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \
- --ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
@@ -74,7 +70,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
- tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Note: disable it until supports V1
# Run AWQ test
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
new file mode 100644
index 000000000000..29c8f5ed5a91
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# This script builds the Ascend NPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Base ubuntu image with basic ascend development libraries and python installed
+VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
+CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
+TEST_RUN_CONFIG_FILE="vllm_test.cfg"
+VLLM_ASCEND_TMP_DIR=
+# Get the test run configuration file from the vllm-ascend repository
+fetch_vllm_test_cfg() {
+ VLLM_ASCEND_TMP_DIR=$(mktemp -d)
+ # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
+ cleanup() {
+ rm -rf "${VLLM_ASCEND_TMP_DIR}"
+ }
+ trap cleanup EXIT
+
+ GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
+ if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
+ echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
+ exit 1
+ fi
+
+ # If the file already exists locally, just overwrite it
+ cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
+ echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
+
+ # The trap above is only needed while fetching the configuration file. Since the copy has
+ # already succeeded at this point, delete the temporary directory manually and clear the trap.
+ rm -rf "${VLLM_ASCEND_TMP_DIR}"
+ trap - EXIT
+}
+
+# Loads the previously fetched test run configuration file
+# into the current script environment.
+get_config() {
+ if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
+ echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
+ exit 1
+ fi
+ source "${TEST_RUN_CONFIG_FILE}"
+ echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
+ return 0
+}
+
+# get test running configuration.
+fetch_vllm_test_cfg
+get_config
+# Check if the function call was successful. If not, exit the script.
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
+container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
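+# e.g. "atlas-a2-001-1-2cards" -> agent_idx=1 (the second-to-last dash-separated field)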
+agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
+echo "agent_idx: ${agent_idx}"
+builder_name="cachebuilder${agent_idx}"
+builder_cache_dir="/mnt/docker-cache${agent_idx}"
+mkdir -p ${builder_cache_dir}
+
+# Try building the docker image
+cat <<EOF | DOCKER_BUILDKIT=1 docker build --progress=plain -t "${image_name}" -f - .
+FROM ${BASE_IMAGE_NAME}
+
+# Install pytest and modelscope in advance to keep this docker cache layer valid
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install 'pytest>=6.0' modelscope
+
+WORKDIR /workspace/vllm
+
+# Install vLLM dependencies in advance so that, as long as common.txt remains unchanged, the docker cache layer stays valid.
+COPY requirements/common.txt /workspace/vllm/requirements/common.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -r requirements/common.txt
+
+COPY . .
+
+# Install vLLM
+RUN --mount=type=cache,target=/root/.cache/pip \
+ VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+ python3 -m pip uninstall -y triton
+
+# Install vllm-ascend
+WORKDIR /workspace
+ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
+ARG VLLM_ASCEND_TAG=main
+RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
+ git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
+
+# Install vllm-ascend dependencies in advance so that, as long as its requirements.txt remains unchanged, the docker cache layer stays valid.
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -r /workspace/vllm-ascend/requirements.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+ source /usr/local/Ascend/nnal/atb/set_env.sh && \
+ export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+ python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ENV VLLM_USE_MODELSCOPE=True
+
+WORKDIR /workspace/vllm-ascend
+
+CMD ["/bin/bash"]
+
+EOF
+
+# Setup cleanup
+remove_docker_container() {
+ docker rm -f "${container_name}" || true;
+ docker image rm -f "${image_name}" || true;
+ docker system prune -f || true;
+}
+trap remove_docker_container EXIT
+
+# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
+# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
+# e.g. atlas-a2-001-1-2cards means this is the first agent on the atlas-a2-001 host, and it has 2 NPU cards.
+# returns --device /dev/davinci0 --device /dev/davinci1
+parse_and_gen_devices() {
+ local input="$1"
+ local index cards_num
+ if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
+ index="${BASH_REMATCH[1]}"
+ cards_num="${BASH_REMATCH[2]}"
+ else
+ echo "parse error" >&2
+ return 1
+ fi
+
+ local devices=""
+ local i=0
+ while (( i < cards_num )); do
+ local dev_idx=$(((index - 1)*cards_num + i ))
+ devices="$devices --device /dev/davinci${dev_idx}"
+ ((i++))
+ done
+
+ # trim leading space
+ devices="${devices#"${devices%%[![:space:]]*}"}"
+ # Print the devices string so the caller can capture it
+ printf '%s' "$devices"
+}
+
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+
+# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
+# This test checks whether the OOT platform interface is functioning properly in conjunction with
+# the hardware plugin vllm-ascend.
+model_cache_dir=/mnt/modelscope${agent_idx}
+mkdir -p ${model_cache_dir}
+docker run \
+ ${devices} \
+ --device /dev/davinci_manager \
+ --device /dev/devmm_svm \
+ --device /dev/hisi_hdc \
+ -v /usr/local/dcmi:/usr/local/dcmi \
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+ -v /etc/ascend_install.info:/etc/ascend_install.info \
+ -v ${model_cache_dir}:/root/.cache/modelscope \
+ --entrypoint="" \
+ --name "${container_name}" \
+ "${image_name}" \
+ bash -c '
+ set -e
+ pytest -v -s tests/e2e/vllm_interface/
+'
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index 1073a4ee30af..cbb2527a4ff0 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
- && python3 -m pip install --progress-bar off hf-transfer
+ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 505664f3aecd..f022fa3672ee 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
- && python3 -m pip install --progress-bar off hf-transfer
+ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index efcd10acf0b9..250a64fdd071 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -30,20 +30,19 @@ docker run \
bash -c '
set -e
echo $ZE_AFFINITY_MASK
+ pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
- VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
+ pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_metrics_reader.py
'
diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh
index 72812218cb66..51536b36b808 100644
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
new file mode 100755
index 000000000000..5b25c358fc4a
--- /dev/null
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Setup script for Prime-RL integration tests
+# This script prepares the environment for running Prime-RL tests with nightly vLLM
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
+PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+
+echo "Setting up Prime-RL integration test environment..."
+
+# Clean up any existing Prime-RL directory
+if [ -d "${PRIME_RL_DIR}" ]; then
+ echo "Removing existing Prime-RL directory..."
+ rm -rf "${PRIME_RL_DIR}"
+fi
+
+# Install UV if not available
+if ! command -v uv &> /dev/null; then
+ echo "Installing UV package manager..."
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ source $HOME/.local/bin/env
+fi
+
+# Clone Prime-RL repository at specific branch for reproducible tests
+PRIME_RL_BRANCH="integ-vllm-main"
+echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
+git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
+cd "${PRIME_RL_DIR}"
+
+echo "Setting up UV project environment..."
+export UV_PROJECT_ENVIRONMENT=/usr/local
+ln -s /usr/bin/python3 /usr/local/bin/python
+
+# Remove vllm pin from pyproject.toml
+echo "Removing vllm pin from pyproject.toml..."
+sed -i '/vllm==/d' pyproject.toml
+
+# Sync Prime-RL dependencies
+echo "Installing Prime-RL dependencies..."
+uv sync --inexact && uv sync --inexact --all-extras
+
+# Verify installation
+echo "Verifying installations..."
+uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
+
+echo "Prime-RL integration test environment setup complete!"
+
+echo "Running Prime-RL integration tests..."
+export WANDB_MODE=offline # so the test does not require a WANDB_API_KEY
+uv run pytest -vs tests/integration/test_rl.py -m gpu
+
+echo "Prime-RL integration tests completed!"
diff --git a/.buildkite/scripts/tpu/quantized_v6e_1.env b/.buildkite/scripts/tpu/quantized_v6e_1.env
index bd25c803081a..ecb98d4516bd 100644
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
INPUT_LEN=1800
OUTPUT_LEN=128
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
index b1e17b438578..3364fce8e1fd 100755
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,7 +42,7 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
--seed 42 \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
new file mode 100644
index 000000000000..50b2b61124af
--- /dev/null
+++ b/.buildkite/test-amd.yaml
@@ -0,0 +1,1267 @@
+# In this file, you can add more tests to run either by adding a new step or
+# adding a new command to an existing step. See the documented options below for examples.
+
+# This script will be fed into the Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# to generate the final pipeline yaml file.
+
+# Documentation
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# command(str): the single command to run for tests. incompatible with commands.
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+# in this case, commands must be specified. the first command runs on the first host, the second
+# command runs on the second host.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+
+# When adding a test
+# - If the test belongs to an existing group, add it there
+# - If the test is short, add to any existing step
+# - If the test takes more than 10min, then it is okay to create a new step.
+# Note that all steps execute in parallel.
+
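+# Illustrative only: a minimal step using the fields documented above;
+# the label, source paths, and command are placeholders, not a real test:
+#
+# - label: Example Test # 10min
+#   timeout_in_minutes: 20
+#   mirror_hardwares: [amdexperimental]
+#   source_file_dependencies:
+#   - vllm/
+#   - tests/example
+#   commands:
+#   - pytest -v -s example
+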
+steps:
+##### fast check tests #####
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+ # if this test fails, it means the nightly torch version is not compatible with some
+ # of the dependencies. Please check the error message and add the package to the whitelist
+ # in /vllm/tools/generate_nightly_torch_test.py
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ soft_fail: true
+ source_file_dependencies:
+ - requirements/nightly_torch_test.txt
+ commands:
+ - bash standalone_tests/pytorch_nightly_dependency.sh
+
+- label: Async Engine, Inputs, Utils, Worker Test # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+ timeout_in_minutes: 10
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/test_inputs.py
+ - tests/test_outputs.py
+ - tests/multimodal
+ - tests/standalone_tests/lazy_imports.py
+ - tests/transformers_utils
+ no_gpu: true
+ commands:
+ - python3 standalone_tests/lazy_imports.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s test_outputs.py
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s transformers_utils
+
+- label: Python-only Installation Test # 10min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
+- label: Basic Correctness Test # 20min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_cumem.py
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s basic_correctness/test_cumem.py
+ - pytest -v -s basic_correctness/test_basic_correctness.py
+ - pytest -v -s basic_correctness/test_cpu_offload.py
+
+- label: Entrypoints Unit Tests # 5min
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration Test (LLM) # 30min
+ timeout_in_minutes: 40
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/llm
+ - tests/entrypoints/offline_mode
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration Test (API Server) # 100min
+ timeout_in_minutes: 130
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai
+ - tests/entrypoints/test_chat_utils
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+ - pytest -v -s entrypoints/test_chat_utils.py
+
+- label: Entrypoints Integration Test (Pooling)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - tests/distributed/test_utils
+ - tests/distributed/test_pynccl
+ - tests/distributed/test_events
+ - tests/compile/test_basic_correctness
+ - examples/offline_inference/rlhf.py
+ - examples/offline_inference/rlhf_colocate.py
+ - tests/examples/offline_inference/data_parallel.py
+ - tests/v1/distributed
+ - tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
+ commands:
+ # test with torchrun tp=2 and external_dp=2
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and pp=2
+ - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+ - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+ - pytest -v -s distributed/test_utils.py
+ - pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s distributed/test_pynccl.py
+ - pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - pushd ../examples/offline_inference
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+ - popd
+
+- label: EPLB Algorithm Test # 5min
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_algo.py
+ commands:
+ - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution Test # 5min
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_execute.py
+ commands:
+ - pytest -v -s distributed/test_eplb_execute.py
+
+- label: Metrics, Tracing Test # 12min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_2
+ # grade: Blocking
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/tracing
+ commands:
+ - "pip install \
+ 'opentelemetry-sdk>=1.26.0' \
+ 'opentelemetry-api>=1.26.0' \
+ 'opentelemetry-exporter-otlp>=1.26.0' \
+ 'opentelemetry-semantic-conventions-ai>=0.4.1'"
+ - pytest -v -s v1/tracing
+
+##### fast check tests #####
+##### 1 GPU test #####
+
+- label: Regression Test # 7min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/test_regression
+ commands:
+ - pip install modelscope
+ - pytest -v -s test_regression.py
+ working_dir: "/vllm-workspace/tests" # optional
+
+- label: Engine Test # 25min
+ timeout_in_minutes: 40
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/engine
+ - tests/tokenization
+ - tests/test_sequence
+ - tests/test_config
+ - tests/test_logger
+ - tests/test_vllm_port
+ commands:
+ - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+ # OOM in the CI unless we run this separately
+ - pytest -v -s tokenization
+
+- label: V1 Test e2e + engine # 30min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # TODO: accuracy does not match on H100, whether or not
+ # VLLM_USE_FLASHINFER_SAMPLER is set.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
+
+- label: V1 Test entrypoints # 35min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: V1 Test others # 42min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'not cpu_test' v1/core
+ - pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/logits_processors
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/spec_decode
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
+ - pytest -v -s v1/test_oracle.py
+ - pytest -v -s v1/test_request.py
+ # Integration test for streaming correctness (requires special branch).
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+ - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Test others (CPU) # 5 mins
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+
+- label: Examples Test # 30min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/examples"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - examples/
+ commands:
+ - pip install tensorizer # for tensorizer test
+ - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+ - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+ - python3 offline_inference/basic/chat.py
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_pooling.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+
+- label: Platform Tests (CUDA) # 4min
+ timeout_in_minutes: 15
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/cuda
+ commands:
+ - pytest -v -s cuda/test_cuda_context.py
+
+- label: Samplers Test # 56min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/model_executor/layers
+ - vllm/sampling_metadata.py
+ - tests/samplers
+ - tests/conftest.py
+ commands:
+ - pytest -v -s samplers
+ - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+
+- label: LoRA Test %N # 20min each
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - pytest -v -s lora \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --ignore=lora/test_chatglm3_tp.py \
+ --ignore=lora/test_llama_tp.py \
+ --ignore=lora/test_llm_with_multi_loras.py
+ parallelism: 4
+
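+# Sharded steps (those with a "%N" suffix in the label, like the LoRA test above)
+# use Buildkite parallelism: `parallelism: 4` launches four jobs for the step, and
+# each job reads $$BUILDKITE_PARALLEL_JOB (0-based index) together with
+# $$BUILDKITE_PARALLEL_JOB_COUNT to select its shard; the "$$" defers expansion
+# until the job actually runs. As a rough, illustrative sketch only, the second of
+# the four LoRA jobs effectively runs:
+#   pytest -v -s lora --shard-id=1 --num-shards=4 \
+#     --ignore=lora/test_chatglm3_tp.py \
+#     --ignore=lora/test_llama_tp.py \
+#     --ignore=lora/test_llm_with_multi_loras.py
+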
+- label: PyTorch Compilation Unit Tests # 15min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_pass_manager.py
+ - pytest -v -s compile/test_fusion.py
+ - pytest -v -s compile/test_fusion_attn.py
+ - pytest -v -s compile/test_functionalization.py
+ - pytest -v -s compile/test_silu_mul_quant_fusion.py
+ - pytest -v -s compile/test_sequence_parallelism.py
+ - pytest -v -s compile/test_async_tp.py
+ - pytest -v -s compile/test_fusion_all_reduce.py
+ - pytest -v -s compile/test_decorator.py
+ - pytest -v -s compile/test_noop_elimination.py
+ - pytest -v -s compile/test_aot_compile.py
+
+- label: PyTorch Fullgraph Smoke Test # 15min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s compile/piecewise/
+
+- label: PyTorch Fullgraph Test # 20min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_full_graph.py
+
+- label: Kernels Core Operation Test # 48min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - tests/kernels/core
+ commands:
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N # 23min
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/attention/
+ - vllm/attention
+ - vllm/v1/attention
+ - tests/kernels/attention
+ commands:
+ - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Quantization Test %N # 64min
+ timeout_in_minutes: 90
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/quantization/
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization
+ commands:
+ - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels MoE Test %N # 40min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/quantization/cutlass_w8a8/moe/
+ - csrc/moe/
+ - tests/kernels/moe
+ - vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
+ commands:
+ - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Mamba Test # 31min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/mamba/
+ - tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
+ commands:
+ - pytest -v -s kernels/mamba
+
+- label: Model Executor Test # 23min
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/model_executor
+ - tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+ commands:
+ - apt-get update && apt-get install -y curl libsodium23
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+
+- label: Benchmarks # 11min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_8
+ # grade: Blocking
+ working_dir: "/vllm-workspace/.buildkite"
+ source_file_dependencies:
+ - benchmarks/
+ commands:
+ - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test # 7min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/benchmarks/
+ commands:
+ - pytest -v -s benchmarks/
+
+- label: Quantization Test # 70min
+ timeout_in_minutes: 90
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ - tests/quantization
+ commands:
+ # temporary install here since we need nightly, will move to requirements/test.in
+ # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
+ # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+
+- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: OpenAI API correctness # 22min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/entrypoints/openai/
+ - vllm/model_executor/models/whisper.py
+ commands: # LMEval+Transcription WER check
+ - pytest -s entrypoints/openai/correctness/
+
+- label: OpenAI-Compatible Tool Use # 23 min
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ fast_check: false
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ commands:
+ - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ no_gpu: true
+ commands:
+ - pytest -v -s -m 'cpu_test' tool_use
+
+##### models test #####
+
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_initialization.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_8
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+ # Only run when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the above
+ # test). Also run if the model initialization test file is modified.
+ - pytest -v -s models/test_initialization.py \
+ -k 'not test_can_initialize_small_subset' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 10
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language
+ commands:
+ # Test standard language models, excluding a subset of slow tests
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+ # Shard slow subset of standard language models tests. Only run when model
+ # source is modified, or when specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation \
+ -m hybrid_model \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+ - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+ - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling
+ commands:
+ - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing
+
+- label: Multi-Modal Models Test (Standard) # 60min
+ timeout_in_minutes: 80
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Models Test (Extended) 1
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models Test (Extended) 2
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models Test (Extended) 3
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+- label: Quantized Models Test # 45 min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/model_executor/layers/quantization
+ - tests/models/quantization
+ commands:
+ - pytest -v -s models/quantization
+
+# This test is used only in the PR development phase to test individual models and should never run on main
+- label: Custom Models Test
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ commands:
+ - echo 'Testing custom models...'
+ # PR authors can temporarily add commands below to test individual models
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
+
+- label: Transformers Nightly Models Test
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ working_dir: "/vllm-workspace/"
+ optional: true
+ commands:
+ - pip install --upgrade git+https://github.com/huggingface/transformers
+ - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
+ - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py
+ - python3 examples/offline_inference/basic/chat.py
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+
+- label: Blackwell Test # 38 min
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ # optional: true
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - csrc/attention/mla/
+ - csrc/quantization/cutlass_w8a8/moe/
+ - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/fusion.py
+ - vllm/compilation/fusion_attn.py
+ commands:
+ - nvidia-smi
+ - python3 examples/offline_inference/basic/chat.py
+ # Attention
+ # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+ - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+ # Quantization
+ - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ # Fusion
+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+
+- label: Blackwell GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
+##### 1 GPU test #####
+##### multi gpus test #####
+
+- label: Distributed Comm Ops Test # 7min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/distributed
+ - tests/distributed
+ commands:
+ - pytest -v -s distributed/test_comm_ops.py
+ - pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
+
+- label: 2 Node Tests (4 GPUs in total) # 16min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ num_nodes: 2
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ - tests/examples/offline_inference/data_parallel.py
+ commands:
+ - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+ - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+ - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+ - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
+ - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+ - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+ - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+ - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+
+- label: Distributed Tests (2 GPUs) # 68min
+ timeout_in_minutes: 90
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/compilation/
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/worker/worker_base.py
+ - vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
+ commands:
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s ./compile/test_basic_correctness.py
+ - pytest -v -s ./compile/test_wrapper.py
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+ # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
+
+- label: Plugin Tests (2 GPUs) # 40min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/plugins/
+ - tests/plugins/
+ commands:
+ # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+ - pip install -e ./plugins/vllm_add_dummy_platform
+ - pytest -v -s plugins_tests/test_platform_plugins.py
+ - pip uninstall vllm_add_dummy_platform -y
+ # end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
+ # other tests continue here:
+ - pytest -v -s plugins_tests/test_scheduler_plugins.py
+ - pip install -e ./plugins/vllm_add_dummy_model
+ - pytest -v -s distributed/test_distributed_oot.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+ - pytest -v -s models/test_oot_registration.py # it needs a clean process
+ - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+
+- label: Pipeline + Context Parallelism Test # 45min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ commands:
+ - pytest -v -s distributed/test_pp_cudagraph.py
+ - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: LoRA TP Test (Distributed) # 17 min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ # FIXIT: find out which code initializes CUDA before running the test
+ # until that is fixed, we need to use spawn to test it
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
+
+
+- label: Weight Loading Multiple GPU Test # 33min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ gpu: a100
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+
+
+##### multi gpus test #####
+##### A100 test #####
+
+- label: Distributed Tests (A100) # optional
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/
+ commands:
+ # NOTE: don't test the llama model here; the HF implementation seems to be buggy
+ # see https://github.com/vllm-project/vllm/pull/5689 for details
+ - pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - pytest -v -s -x lora/test_mixtral.py
+
+- label: LM Eval Large Models # optional
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
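+
+# The steps above follow the schema documented at the top of
+# .buildkite/test-pipeline.yaml, plus a few keys that the documentation block
+# there does not cover. As a rough, non-authoritative reading of the entries
+# above: agent_pool selects an AMD MI325 agent pool (the numeric suffix varies
+# with the step's GPU and parallelism needs), grade: Blocking is commented out on
+# most steps and appears to mark a step as blocking when enabled, and no_gpu: true
+# marks CPU-only steps that need no GPU agent.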
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b0d4c4456d33..a28e333eac69 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -6,24 +6,28 @@
# to generate the final pipeline yaml file.
# Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
-# in this case, commands must be specified. the first command runs on first host, the second
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, defaults to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt the test in for; if empty, the test will always run.
# When adding a test
-# - If the test belong to an existing group, add it there
+# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
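+#
+# As a rough, hypothetical illustration of how the fields above combine (not a
+# real CI step; the label, module path and test directory are placeholders):
+#
+# - label: My New Test # 10min
+#   timeout_in_minutes: 20
+#   fast_check: true
+#   source_file_dependencies:
+#     - vllm/my_module/
+#     - tests/my_test_dir
+#   commands:
+#     - pytest -v -s my_test_dir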
@@ -46,23 +50,28 @@ steps:
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- - tests/mq_llm_engine
- - tests/async_engine
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- - tests/utils_
- - tests/worker
- tests/standalone_tests/lazy_imports.py
+ - tests/transformers_utils
+ no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- - pytest -v -s mq_llm_engine # MQLLMEngine
- - pytest -v -s async_engine # AsyncLLMEngine
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- - pytest -v -s multimodal
- - pytest -v -s utils_ # Utils
- - pytest -v -s worker # Worker
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s transformers_utils
- label: Python-only Installation Test # 10min
timeout_in_minutes: 20
@@ -82,27 +91,25 @@ steps:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- - tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-- label: Core Test # 22min
- timeout_in_minutes: 35
- mirror_hardwares: [amdexperimental]
+- label: Entrypoints Unit Tests # 5min
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
fast_check: true
source_file_dependencies:
- - vllm/core
- - vllm/distributed
- - tests/core
+ - vllm/entrypoints
+ - tests/entrypoints/
commands:
- - pytest -v -s core
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-- label: Entrypoints Test (LLM) # 30min
+- label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
@@ -114,12 +121,11 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-- label: Entrypoints Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
@@ -132,9 +138,22 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
+- label: Entrypoints Integration Test (Pooling)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
- label: Distributed Tests (4 GPUs) # 35min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@@ -142,7 +161,6 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- - vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@@ -150,28 +168,34 @@ steps:
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- - tests/v1/test_async_llm_dp.py
- - tests/v1/test_external_lb_dp.py
- - tests/v1/test_internal_lb_dp.py
- - tests/v1/test_hybrid_lb_dp.py
+ - tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
commands:
- # test with tp=2 and external_dp=2
- - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- # test with tp=2 and pp=2
+ # test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
- - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
- - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- pushd ../examples/offline_inference
@@ -204,16 +228,14 @@ steps:
num_gpus: 2
source_file_dependencies:
- vllm/
- - tests/metrics
- - tests/tracing
+ - tests/v1/tracing
commands:
- - pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- - pytest -v -s tracing
+ - pytest -v -s v1/tracing
##### fast check tests #####
##### 1 GPU test #####
@@ -274,23 +296,35 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- - pytest -v -s v1/core
+ - pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- - pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
- - pytest -v -s v1/kv_connector/unit
- - pytest -v -s v1/metrics
- - pytest -v -s v1/test_serial_utils.py
- - pytest -v -s v1/test_utils.py
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- - pytest -v -s v1/test_metrics_reader.py
+ - pytest -v -s v1/test_request.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+- label: V1 Test others (CPU) # 5 mins
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+
- label: Examples Test # 30min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
@@ -309,13 +343,13 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@@ -364,11 +398,12 @@ steps:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
+ - pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- - pytest -v -s compile/test_sequence_parallelism.py
- - pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
+ - pytest -v -s compile/test_noop_elimination.py
+ - pytest -v -s compile/test_aot_compile.py
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -379,14 +414,10 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- # these tests need to be separated, cannot combine
- - pytest -v -s compile/piecewise/test_simple.py
- - pytest -v -s compile/piecewise/test_toy_llama.py
- - pytest -v -s compile/piecewise/test_full_cudagraph.py
- - pytest -v -s compile/piecewise/test_multiple_graphs.py
+ - pytest -v -s compile/piecewise/
-- label: PyTorch Fullgraph Test # 20min
- timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -394,6 +425,7 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
+ - pytest -v -s compile/test_fusions_e2e.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@@ -401,8 +433,9 @@ steps:
source_file_dependencies:
- csrc/
- tests/kernels/core
+ - tests/kernels/test_top_k_per_row.py
commands:
- - pytest -v -s kernels/core
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
- label: Kernels Attention Test %N # 23min
timeout_in_minutes: 35
@@ -446,32 +479,22 @@ steps:
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
-- label: Tensorizer Test # 14min
- timeout_in_minutes: 25
- mirror_hardwares: [amdexperimental]
- source_file_dependencies:
- - vllm/model_executor/model_loader
- - tests/tensorizer_loader
- - tests/entrypoints/openai/test_tensorizer_entrypoint.py
- commands:
- - apt-get update && apt-get install -y curl libsodium23
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s tensorizer_loader
- - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
-- label: Model Executor Test # 7min
- timeout_in_minutes: 20
+- label: Model Executor Test # 23min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Benchmarks # 11min
timeout_in_minutes: 20
@@ -501,8 +524,13 @@ steps:
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release, and pin a working version of torchao nightly here
- - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
+ # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
@@ -523,15 +551,6 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
-- label: Encoder Decoder tests # 12min
- timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental]
- source_file_dependencies:
- - vllm/
- - tests/encoder_decoder
- commands:
- - pytest -v -s encoder_decoder
-
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
@@ -539,43 +558,105 @@ steps:
source_file_dependencies:
- vllm/
- tests/tool_use
- - tests/mistral_tool_use
commands:
- - pytest -v -s tool_use
- - pytest -v -s mistral_tool_use
+ - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ no_gpu: true
+ commands:
+ - pytest -v -s -m 'cpu_test' tool_use
##### models test #####
-- label: Basic Models Test # 57min
- timeout_in_minutes: 75
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- - tests/models
+ - tests/models/test_initialization.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
commands:
- - pytest -v -s models/test_transformers.py
- - pytest -v -s models/test_registry.py
- - pytest -v -s models/test_utils.py
- - pytest -v -s models/test_vision.py
- - pytest -v -s models/test_initialization.py
+ # Only run when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the above
+ # test). Also run if the model initialization test file is modified.
+ - pytest -v -s models/test_initialization.py \
+ -k 'not test_can_initialize_small_subset' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
-- label: Language Models Test (Standard) # 35min
+- label: Basic Models Tests (Other)
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ timeout_in_minutes: 10
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
- tests/models/language
commands:
+ # Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- - pytest -v -s models/language -m core_model
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+    # Shard the slow subset of standard language model tests. Only run when model
+    # source is modified, or when the specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
@@ -583,7 +664,12 @@ steps:
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- - pytest -v -s models/language/generation -m hybrid_model
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation \
+ -m hybrid_model \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
@@ -597,6 +683,16 @@ steps:
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
- label: Language Models Test (Extended Pooling) # 36min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@@ -607,6 +703,16 @@ steps:
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
+
- label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60
source_file_dependencies:
@@ -627,7 +733,17 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+ timeout_in_minutes: 70
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - vllm/multimodal/
+ - vllm/inputs/
+ - vllm/v1/core/
+ commands:
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
@@ -684,14 +800,16 @@ steps:
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- - python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
-- label: Blackwell Test # 38 min
- timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+ timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@@ -704,8 +822,6 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- - vllm/compilation/fusion.py
- - vllm/compilation/fusion_attn.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@@ -713,21 +829,82 @@ steps:
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
- # Fusion
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
- - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ commands:
+ - nvidia-smi
+ - pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ # this runner has 2 GPUs available even though num_gpus=2 is not set
+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/test_fusions_e2e.py
+
+- label: Blackwell GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
@@ -743,6 +920,8 @@ steps:
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
- label: 2 Node Tests (4 GPUs in total) # 16min
timeout_in_minutes: 30
@@ -769,46 +948,58 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-- label: Distributed Tests (2 GPUs) # 110min
- timeout_in_minutes: 150
+- label: Distributed Tests (2 GPUs) # 68min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
+ - vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- - vllm/model_executor/models/
- - tests/distributed/
- - vllm/compilation
- vllm/worker/worker_base.py
- - vllm/worker/worker.py
- - vllm/worker/model_runner.py
- - entrypoints/llm/test_collective_rpc.py
- - tests/v1/test_async_llm_dp.py
- - tests/v1/test_external_lb_dp.py
- - tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
commands:
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
- # test sequence parallel
- - pytest -v -s distributed/test_sequence_parallel.py
- # this test fails consistently.
- # TODO: investigate and fix
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- - pytest -v -s models/multimodal/generation/test_maverick.py
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
@@ -827,8 +1018,13 @@ steps:
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- - pip uninstall prithvi_io_processor_plugin -y
+ - pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
+ # begin stat_logger plugins test
+ - pip install -e ./plugins/vllm_add_dummy_stat_logger
+ - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+ - pip uninstall dummy_stat_logger -y
+ # end stat_logger plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@@ -851,7 +1047,6 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
- label: LoRA TP Test (Distributed) # 17 min
timeout_in_minutes: 30
@@ -875,7 +1070,7 @@ steps:
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
- num_gpus: 2
+ num_gpus: 2
optional: true
source_file_dependencies:
- vllm/
@@ -894,6 +1089,17 @@ steps:
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
##### multi gpus test #####
@@ -925,9 +1131,38 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-- label: Qwen MoE EP Test # optional
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/compile/test_async_tp.py
+ - pytest -v -s tests/compile/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - pytest -v -s tests/distributed/test_context_parallel.py
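+  # Data-parallel smoke test (DP=2, TP=1) with the DeepEP high-throughput all2all backend and DeepGEMM enabled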
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
new file mode 100644
index 000000000000..0d8b6d0a4f93
--- /dev/null
+++ b/.buildkite/test-template.j2
@@ -0,0 +1,47 @@
+{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
+{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %}
+{% set default_working_dir = "vllm/tests" %}
+{% set hf_home = "/root/.cache/huggingface" %}
+
+steps:
+ - label: ":docker: build image"
+ depends_on: ~
+ commands:
+ - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f docker/Dockerfile.rocm --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --target test --progress plain ."
+ - "docker push {{ docker_image_amd }}"
+ key: "amd-build"
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
+ agents:
+ queue: amd-cpu
+ soft_fail: false
+
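+{# For each step that mirrors onto AMD hardware, emit a matching "AMD: ..." step that runs the same commands on the ROCm queue once the amd-build image is published #}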
+{% for step in steps %}
+{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+ - label: "AMD: {{ step.label }}"
+ depends_on:
+ - "amd-build"
+ agents:
+{% if step.amd_gpus and step.amd_gpus==8%}
+ queue: amd_gpu
+{% elif step.amd_gpus and step.amd_gpus==4%}
+ queue: amd_gpu
+{% elif step.amd_gpus and step.amd_gpus==2%}
+ queue: amd_gpu
+{% else%}
+ queue: amd_gpu
+{% endif%}
+ commands:
+ - bash .buildkite/scripts/hardware_ci/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
+ env:
+ DOCKER_BUILDKIT: "1"
+ priority: 100
+ soft_fail: false
+{% endif %}
+{% endfor %}
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 000000000000..b7a9fdb4e05a
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,47 @@
+[run]
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+ vllm
+ */dist-packages/vllm
+ */site-packages/vllm
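+# Paths excluded from coverage measurement (tests, build artifacts, docs, etc.)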
+omit =
+ */tests/*
+ */test_*
+ */__pycache__/*
+ */build/*
+ */dist/*
+ */vllm.egg-info/*
+ */third_party/*
+ */examples/*
+ */benchmarks/*
+ */docs/*
+
+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+ vllm
+ /vllm-workspace/src/vllm
+ /vllm-workspace/vllm
+ */site-packages/vllm
+ */dist-packages/vllm
+
+[report]
+exclude_lines =
+ pragma: no cover
+ def __repr__
+ if self.debug:
+ if settings.DEBUG
+ raise AssertionError
+ raise NotImplementedError
+ if 0:
+ if __name__ == .__main__.:
+ class .*\bProtocol\):
+ @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 000000000000..5a601d00cef8
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,4 @@
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml
new file mode 100644
index 000000000000..443dfa45af22
--- /dev/null
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+# - "vllm/v1/attetion/*.py"
+# - "vllm/v1/core/*.py"
+exclude:
+ - "**/*.py"
+
+scan:
+ functions: true # check free functions and methods
+ classes: true # check classes/dataclasses
+ public_only: true # ignore names starting with "_" at any level
+
+annotations:
+ include: # decorators that force‑include a symbol
+ - name: "bc_linter_include" # matched by simple name or dotted suffix
+ propagate_to_members: false # for classes, include methods/inner classes
+ exclude: # decorators that force‑exclude a symbol
+ - name: "bc_linter_skip" # matched by simple name or dotted suffix
+ propagate_to_members: true # for classes, exclude methods/inner classes
+
+excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index b6b3e184bff2..024bdf2526df 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,93 +1,7 @@
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file
-# This lists cover the "core" components of vLLM that require careful review
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
-/vllm/model_executor/layers/mamba @tdoublep
-/vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96
-/vllm/v1/sample @22quinn @houseroad
-/vllm/vllm_flash_attn @LucasWilkinson
-/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg
-CMakeLists.txt @tlrmchlsmth @LucasWilkinson
+* @wuhuikx @zejunchen-zejun @tjtanaavllm @kliuae-amd
-# Any change to the VllmConfig changes can have a large user-facing impact,
-# so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-
-# vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
-/vllm/v1/spec_decode @benchislett @luccafong
-/vllm/v1/attention/backends/triton_attn.py @tdoublep
-
-# Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
-/tests/distributed/test_multi_node_assignment.py @youkaichao
-/tests/distributed/test_pipeline_parallel.py @youkaichao
-/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
-/tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
-/tests/prefix_caching @comaniac @KuntaiDu
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
-/tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
-/tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/weight_loading @mgoin @youkaichao @yewentao256
-/tests/lora @jeejeelee
-/tests/models/language/generation/test_hybrid.py @tdoublep
-
-# Docs
-/docs @hmellor
-mkdocs.yaml @hmellor
-
-# CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
-/csrc/cpu @bigPYJ1151
-/vllm/platforms/cpu.py @bigPYJ1151
-/cmake/cpu_extension.cmake @bigPYJ1151
-/docker/Dockerfile.cpu @bigPYJ1151
-
-# Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
-/vllm/platforms/xpu.py @jikunshang
-/docker/Dockerfile.xpu @jikunshang
-
-# Qwen-specific files
-/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
-/vllm/model_executor/models/qwen* @sighingnow
-
-# MTP-specific files
-/vllm/model_executor/models/deepseek_mtp.py @luccafong
-
-# Mistral-specific files
-/vllm/model_executor/models/mistral*.py @patrickvonplaten
-/vllm/model_executor/models/mixtral*.py @patrickvonplaten
-/vllm/model_executor/models/voxtral*.py @patrickvonplaten
-/vllm/model_executor/models/pixtral*.py @patrickvonplaten
-/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
-/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
-
-# Kernels
-/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/attention/ops/triton_unified_attention.py @tdoublep
-
-# ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
-/vllm/v1/attention/backends/rocm*.py @gshtras
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
-/vllm/attention/ops/rocm*.py @gshtras
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+/csrc/ @wuhuikx @zejunchen-zejun @tjtanaavllm @kliuae-amd
+/vllm/ @wuhuikx @zejunchen-zejun @tjtanaavllm @kliuae-amd
diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml
index 7ee57c42895c..c0e009855964 100644
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -43,10 +43,6 @@ body:
Any other things you would like to mention.
validations:
required: false
-- type: markdown
- attributes:
- value: >
- Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:
diff --git a/.github/mergify.yml b/.github/mergify.yml
index befad23da866..de1a8314a4ec 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -2,6 +2,7 @@ pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
+ - label != stale
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@@ -10,10 +11,13 @@ pull_request_rules:
label:
add:
- documentation
+ comment:
+ message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
+ - label != stale
- or:
- files~=^\.github/
- files~=\.buildkite/
@@ -30,6 +34,7 @@ pull_request_rules:
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
+ - label != stale
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
@@ -46,6 +51,7 @@ pull_request_rules:
- name: label-frontend
description: Automatically apply frontend label
conditions:
+ - label != stale
- files~=^vllm/entrypoints/
actions:
label:
@@ -55,6 +61,7 @@ pull_request_rules:
- name: label-llama
description: Automatically apply llama label
conditions:
+ - label != stale
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
@@ -70,6 +77,7 @@ pull_request_rules:
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
+ - label != stale
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
@@ -83,6 +91,7 @@ pull_request_rules:
- name: label-new-model
description: Automatically apply new-model label
conditions:
+ - label != stale
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
@@ -94,6 +103,7 @@ pull_request_rules:
- name: label-performance
description: Automatically apply performance label
conditions:
+ - label != stale
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
@@ -107,6 +117,7 @@ pull_request_rules:
- name: label-qwen
description: Automatically apply qwen label
conditions:
+ - label != stale
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
@@ -121,12 +132,20 @@ pull_request_rules:
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
+ - label != stale
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
+ - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
+ - files~=^tests/entrypoints/test_context.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
+ - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/tool_server.py
+ - files~=^vllm/entrypoints/tool.py
+ - files~=^vllm/entrypoints/context.py
- title~=(?i)gpt[-_]?oss
+ - title~=(?i)harmony
actions:
label:
add:
@@ -135,6 +154,7 @@ pull_request_rules:
- name: label-rocm
description: Automatically apply rocm label
conditions:
+ - label != stale
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
@@ -155,6 +175,7 @@ pull_request_rules:
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
+ - label != stale
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
@@ -164,7 +185,7 @@ pull_request_rules:
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/
- - files=tests/v1/entrypoints/llm/test_guided_generate.py
+ - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@@ -174,6 +195,7 @@ pull_request_rules:
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
+ - label != stale
- or:
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/
@@ -189,6 +211,7 @@ pull_request_rules:
- name: label-v1
description: Automatically apply v1 label
conditions:
+ - label != stale
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
@@ -201,6 +224,7 @@ pull_request_rules:
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
+ - label != stale
- or:
- files~=tpu.py
- files~=_tpu
@@ -216,6 +240,7 @@ pull_request_rules:
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
+ - label != stale
- and:
- -files~=tpu.py
- -files~=_tpu
@@ -230,9 +255,9 @@ pull_request_rules:
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
+ - label != stale
- or:
- files~=^tests/tool_use/
- - files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
@@ -249,8 +274,9 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- - conflict
- - -closed
+ - label != stale
+ - conflict
+ - -closed
actions:
label:
add:
@@ -264,10 +290,12 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes
conditions:
+ - label != stale
+ - or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- - files~=^tests/tensorizer_loader/
+ - files~=^tests/model_executor/model_loader/tensorizer_loader/
actions:
assign:
users:
@@ -275,6 +303,7 @@ pull_request_rules:
- name: assign reviewer for modelopt changes
conditions:
+ - label != stale
- or:
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
@@ -289,9 +318,27 @@ pull_request_rules:
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- - -conflict
- - -closed
+ - -conflict
+ - -closed
actions:
label:
remove:
- needs-rebase
+
+- name: label-kv-connector
+ description: Automatically apply kv-connector label
+ conditions:
+ - label != stale
+ - or:
+ - files~=^examples/online_serving/disaggregated[^/]*/.*
+ - files~=^examples/offline_inference/disaggregated[^/]*/.*
+ - files~=^examples/others/lmcache/
+ - files~=^tests/v1/kv_connector/
+ - files~=^vllm/distributed/kv_transfer/
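+      # Also label PRs whose titles mention P/D disaggregation, NIXL, or LMCache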
+ - title~=(?i)\bP/?D\b
+ - title~=(?i)NIXL
+ - title~=(?i)LMCache
+ actions:
+ label:
+ add:
+ - kv-connector
\ No newline at end of file
diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml
new file mode 100644
index 000000000000..823695a92132
--- /dev/null
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,29 @@
+name: BC Lint
+
+on:
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+ - labeled
+ - unlabeled
+
+jobs:
+ bc_lint:
+ if: github.repository_owner == 'vllm-project'
+ runs-on: ubuntu-latest
+ steps:
+ - name: Run BC Lint Action
+ uses: pytorch/test-infra/.github/actions/bc-lint@main
+ with:
+ repo: ${{ github.event.pull_request.head.repo.full_name }}
+ base_sha: ${{ github.event.pull_request.base.sha }}
+ head_sha: ${{ github.event.pull_request.head.sha }}
+ suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+ docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+ config_dir: .github
+
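+# Cancel any in-progress BC lint run for the same pull request when new commits are pushed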
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+ cancel-in-progress: true
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index c2b17abe811c..7d565ef9f2e4 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -13,6 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
+ id: label-step
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
@@ -42,7 +43,6 @@ jobs:
searchIn: "body"
},
],
-
// Substring search - matches anywhere in text (partial matches)
substrings: [
{
@@ -89,14 +89,12 @@ jobs:
term: "hip_",
searchIn: "both"
},
-
// ROCm tools and libraries
{
term: "hipify",
searchIn: "both"
},
],
-
// Regex patterns - for complex pattern matching
regexPatterns: [
{
@@ -107,13 +105,17 @@ jobs:
}
],
},
+ // Add more label configurations here as needed
+ // example: {
+ // keywords: [...],
+ // substrings: [...],
+ // regexPatterns: [...]
+ // },
};
-
// Helper function to create regex based on search type
function createSearchRegex(term, type) {
// Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-
switch (type) {
case 'keyword':
// Word boundary search - matches whole words only
@@ -125,16 +127,13 @@ jobs:
throw new Error(`Unknown search type: ${type}`);
}
}
-
// Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = [];
const lines = text.split('\n');
-
for (const termConfig of searchTerms) {
let regex;
let term, searchIn, pattern, description, flags;
-
// Handle different input formats (string or object)
if (typeof termConfig === 'string') {
term = termConfig;
@@ -146,21 +145,17 @@ jobs:
description = termConfig.description;
flags = termConfig.flags;
}
-
// Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) {
continue;
}
-
// Create appropriate regex
if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi");
} else {
regex = createSearchRegex(term, searchType);
}
-
const termMatches = [];
-
// Check each line for matches
lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex);
@@ -175,15 +170,14 @@ jobs:
originalTerm: term || pattern,
description: description,
// Show context around the match in the line
- context: line.length > 100 ?
- line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
- line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
+ context: line.length > 100 ?
+ line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+ line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim()
});
});
}
});
-
if (termMatches.length > 0) {
matches.push({
term: term || (description || pattern),
@@ -196,64 +190,48 @@ jobs:
});
}
}
-
return matches;
}
-
// Helper function to check if label should be added
async function processLabel(labelName, config) {
const body = context.payload.issue.body || "";
const title = context.payload.issue.title || "";
-
core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`);
-
let shouldAddLabel = false;
let allMatches = [];
let reason = '';
-
const keywords = config.keywords || [];
const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || [];
-
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-
// Search in title
if (title.trim()) {
core.notice(`Searching in title: "${title}"`);
-
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
}
-
// Search in body
if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`);
-
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
}
-
if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`);
-
for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-
if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
}
-
// Show details for each match
termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -266,7 +244,6 @@ jobs:
}
});
}
-
shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -274,13 +251,10 @@ jobs:
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
-
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
}
-
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`);
-
if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) {
@@ -296,14 +270,92 @@ jobs:
core.notice(`Label "${labelName}" already present.`);
return false;
}
-
core.notice(`No matching terms found for label "${labelName}".`);
return false;
}
-
// Process all configured labels
- const processLabels = Object.entries(labelConfig)
- .map(([labelName, config]) => processLabel(labelName, config));
- const labelsAdded = await Promise.all(processLabels);
- const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
- core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
\ No newline at end of file
+ const labelsAddedResults = await Promise.all(
+ Object.entries(labelConfig).map(([labelName, config]) =>
+ processLabel(labelName, config).then(added => ({ labelName, added }))
+ )
+ );
+
+ const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
+ core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+
+ // Return which labels were added for the next step
+ const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+ core.setOutput('labels_added', JSON.stringify(addedLabels));
+ return addedLabels;
+
+ - name: CC users for labeled issues
+ if: steps.label-step.outputs.labels_added != '[]'
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+ with:
+ script: |
+ // Configuration: Map labels to GitHub users to CC
+ // You can add multiple users per label, and multiple label configurations
+ const ccConfig = {
+ rocm: {
+ users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
+ message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
+ },
+ // Add more label -> user mappings here
+ // Example:
+ // cuda: {
+ // users: ['user1', 'user2'],
+ // message: 'CC {users} for CUDA-related issue'
+ // },
+ // performance: {
+ // users: ['perfexpert'],
+ // message: 'CC {users} for performance issue'
+ // },
+ };
+
+ const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
+ core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+
+ // Get existing comments to check for already mentioned users
+ const comments = await github.rest.issues.listComments({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ });
+
+ const issueBody = context.payload.issue.body || '';
+ const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
+
+ // Process each label that was added
+ for (const label of labelsAdded) {
+ if (ccConfig[label]) {
+ const config = ccConfig[label];
+ const usersToMention = [];
+
+ // Check which users haven't been mentioned yet
+ for (const user of config.users) {
+ const mentionPattern = new RegExp(`@${user}\\b`, 'i');
+ if (!mentionPattern.test(allExistingText)) {
+ usersToMention.push(user);
+ } else {
+ core.notice(`@${user} already mentioned for label "${label}", skipping`);
+ }
+ }
+
+ // Post comment if there are users to mention
+ if (usersToMention.length > 0) {
+ const mentions = usersToMention.map(u => `@${u}`).join(' ');
+ const message = config.message.replace('{users}', mentions);
+
+ await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: message
+ });
+
+ core.notice(`CC comment added for label "${label}": ${mentions}`);
+ } else {
+ core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+ }
+ }
+ }
\ No newline at end of file
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 000000000000..f3dda4c25c79
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,82 @@
+# This workflow will upload a Python package as a release asset
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Create Release
+
+on:
+ push:
+ tags:
+ - v*
+
+# Needed to create release and upload assets
+permissions:
+ contents: write
+
+jobs:
+ release:
+ # Retrieve tag and create release
+ name: Create Release
+ runs-on: self-hosted
+ container:
+ image: rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0
+ outputs:
+ upload_url: ${{ steps.create_release.outputs.upload_url }}
+ steps:
+ - name: Checkout
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+ - name: Extract branch info
+ shell: bash
+ run: |
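+          # Strip the "refs/tags/" prefix so release_tag holds just the tag name (e.g. refs/tags/v1.0 -> v1.0)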
+ echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
+
+ - name: Create Release
+ id: create_release
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ env:
+ RELEASE_TAG: ${{ env.release_tag }}
+ with:
+ github-token: "${{ secrets.GITHUB_TOKEN }}"
+ script: |
+ const script = require('.github/workflows/scripts/create_release.js')
+ await script(github, context, core)
+
+ wheel:
+ name: Build Wheel
+ runs-on: self-hosted
+ container:
+ image: rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0
+ needs: release
+
+ strategy:
+ fail-fast: false
+
+ steps:
+ - name: Prepare
+ run: |
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
+ pip3 install -U triton
+
+ - name: Checkout
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+ - name: Build wheel
+ shell: bash
+ env:
+          CMAKE_BUILD_TYPE: Release # do not compile with debug symbols to reduce wheel size
+ run: |
+ bash -x .github/workflows/scripts/build.sh
+ wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
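+          # Rename the "linux" platform tag to "manylinux1" for the uploaded release asset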
+ asset_name=${wheel_name//"linux"/"manylinux1"}
+ echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
+ echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
+
+ - name: Upload vllm Release Asset
+ uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ upload_url: ${{ needs.release.outputs.upload_url }}
+ asset_path: ./dist/${{ env.wheel_name }}
+ asset_name: ${{ env.asset_name }}
+ asset_content_type: application/*
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
index c69ebbb42da5..fe4f7c952751 100644
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -1,22 +1,20 @@
#!/bin/bash
set -eux
-python_executable=python$1
-cuda_home=/usr/local/cuda-$2
+python_executable=python3
# Update paths
-PATH=${cuda_home}/bin:$PATH
-LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
-
# Install requirements
-$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
+$python_executable -m pip install -r requirements/rocm.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-bash tools/check_repo.sh
+rm -f "$(which sccache)"
+
+export MAX_JOBS=32
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 82844810a633..dca3089f496c 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+ - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
index c86fed9555d6..cd9df57cd980 100644
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -4,7 +4,6 @@ MD013: false
MD024:
siblings_only: true
MD033: false
-MD042: false
MD045: false
MD046: false
MD051: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c16bdeeecd07..121bdb750de5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,30 +6,19 @@ default_stages:
- manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
-- repo: https://github.com/google/yapf
- rev: v0.43.0
- hooks:
- - id: yapf
- args: [--in-place, --verbose]
- # Keep the same list from yapfignore here to avoid yapf failing without any inputs
- exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.7
+ rev: v0.14.0
hooks:
- - id: ruff
+ - id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
- files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
- rev: v1.35.5
+ rev: v1.38.1
hooks:
- id: typos
-- repo: https://github.com/PyCQA/isort
- rev: 6.0.1
- hooks:
- - id: isort
+ args: [--force-exclude]
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v20.1.3
+ rev: v21.1.2
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -46,10 +35,10 @@ repos:
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
- rev: 0.6.17
+ rev: 0.9.1
hooks:
- id: pip-compile
- args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+ args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@@ -60,38 +49,32 @@ repos:
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
- entry: tools/mypy.sh 0 "local"
- language: python
- types: [python]
- additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+ entry: python tools/pre_commit/mypy.py 0 "local"
stages: [pre-commit] # Don't run in CI
- - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
- name: Run mypy for Python 3.9
- entry: tools/mypy.sh 1 "3.9"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
- stages: [manual] # Only run in CI
+ <<: &mypy_common
+ language: python
+ types_or: [python, pyi]
+ require_serial: true
+ additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
- entry: tools/mypy.sh 1 "3.10"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.10"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
- entry: tools/mypy.sh 1 "3.11"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.11"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
- entry: tools/mypy.sh 1 "3.12"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.12"
+ <<: *mypy_common
+ stages: [manual] # Only run in CI
+ - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+ name: Run mypy for Python 3.13
+ entry: python tools/pre_commit/mypy.py 1 "3.13"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
@@ -155,18 +138,15 @@ repos:
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
- entry: python tools/check_pickle_imports.py
+ entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
- pass_filenames: false
- additional_dependencies: [pathspec, regex]
+ additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
- types: [python]
- pass_filenames: true
- files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+ additional_dependencies: [regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 432975009068..d83d6df35ed9 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -13,6 +13,7 @@ build:
mkdocs:
configuration: mkdocs.yaml
+ fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
diff --git a/.yapfignore b/.yapfignore
index 2d6dcf8380ca..38158259032a 100644
--- a/.yapfignore
+++ b/.yapfignore
@@ -1 +1,2 @@
collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f1f9a781a07..005590445361 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@@ -30,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
#
# Supported/expected torch versions for CUDA/ROCm.
@@ -82,6 +86,9 @@ find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+ CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+ set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
@@ -171,6 +178,25 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+ DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+ CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+ list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
+#
+# Set CUDA include flags for CXX compiler.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+ if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+ endif()
+endif()
+
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@@ -243,8 +269,8 @@ set(VLLM_EXT_SRC
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
- "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
- "csrc/quantization/fp8/common.cu"
+ "csrc/quantization/w8a8/int8/scaled_quant.cu"
+ "csrc/quantization/w8a8/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
@@ -256,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
- set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+ set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -288,14 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
- "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
- "csrc/attention/mla/cutlass_mla_entry.cu"
- "csrc/quantization/fp8/per_token_group_quant.cu")
+ "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+ "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -399,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -427,12 +452,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -457,12 +486,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -493,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -537,7 +570,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
# CUDA 12.8 or later
- cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -556,7 +593,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# FP4 Archs and flags
- cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -578,10 +619,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# CUTLASS MLA Archs and flags
- cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
- "csrc/attention/mla/cutlass_mla_kernels.cu"
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -605,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -623,9 +667,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -644,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
- cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -663,9 +715,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -779,6 +835,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
+ # Hadacore kernels
+ cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+ if(HADACORE_ARCHS)
+ set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${HADACORE_ARCHS}")
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+ message(STATUS "Building hadacore")
+ endif()
+
# if CUDA endif
endif()
@@ -940,6 +1007,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
+ include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)
diff --git a/README.md b/README.md
index 4e03df758c26..3dcdd7dc0094 100644
--- a/README.md
+++ b/README.md
@@ -14,10 +14,14 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
---
*Latest News* 🔥
+- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@@ -78,7 +82,7 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU, with additional support for diverse hardware plugins such as Intel Gaudi, IBM Spyre, and Huawei Ascend
- Prefix caching support
- Multi-LoRA support
@@ -145,6 +149,7 @@ Compute Resources:
- Trainy
- UC Berkeley
- UC San Diego
+- Volcengine
Slack Sponsor: Anyscale
diff --git a/ROCm_performance.md b/ROCm_performance.md
new file mode 100644
index 000000000000..2427423841db
--- /dev/null
+++ b/ROCm_performance.md
@@ -0,0 +1,21 @@
+# Overview of the optional performance features unique to ROCm
+
+## Triton attention
+
+The default attention function on ROCm uses the Triton attention kernel. To fall back to the non-Triton flash-attention implementation, set the following environment variable:
+`VLLM_USE_TRITON_FLASH_ATTN=0`
+
+## Tunable ops
+
+PyTorch tunable ops are supported.
+Set the environment variable `PYTORCH_TUNABLEOP_ENABLED=1` to enable both runtime tuning and the subsequent use of tuned results. To only use previously tuned results without tuning any newly encountered shapes, also set `PYTORCH_TUNABLEOP_TUNING=0`.
+
+## Custom PagedAttention
+
+On ROCm, a custom paged attention kernel is available for better performance and is controlled by the environment variable `VLLM_USE_ROCM_CUSTOM_PAGED_ATTN=1`.
+Currently, this variable is enabled by default. To fall back to the PagedAttention v2 kernel, set it to 0.
+The custom PagedAttention kernel is used for dtype bf16 or fp16, block-size=16, head-size=128, max context length <= 16k, and a GQA ratio (num_heads//num_kv_heads) between 1 and 16. In all other cases, vLLM falls back to the PagedAttention v2 kernel.
+
+## NCCL Performance environment variable
+
+For MI300X, setting the environment variable `NCCL_MIN_NCHANNELS=112` is expected to improve performance.
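+
+As a rough sketch, the knobs above are independent and can be combined when launching the server; the model name below is only a placeholder, and whether each setting helps depends on your workload:
+
+```bash
+# Illustrative combination of the ROCm-specific settings described above.
+# export VLLM_USE_TRITON_FLASH_ATTN=0     # optional: fall back from the default Triton attention kernel
+export PYTORCH_TUNABLEOP_ENABLED=1        # enable TunableOp tuning and reuse of tuned results
+export VLLM_USE_ROCM_CUSTOM_PAGED_ATTN=1  # custom paged attention (already the default)
+export NCCL_MIN_NCHANNELS=112             # NCCL tuning suggested for MI300X
+vllm serve meta-llama/Llama-2-7b-chat-hf  # placeholder model
+```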
diff --git a/benchmarks/P3L.py b/benchmarks/P3L.py
new file mode 100755
index 000000000000..793b88a4a61e
--- /dev/null
+++ b/benchmarks/P3L.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Patch-Perplexity (P3L)
+
+This is a script that produces a realistic PPL measurement
+for the quantized KV cache system by processing a sequence of
+non-overlapping patches of the reference text. Generation of the
+consecutive symbols in each patch is governed (forced)
+by the reference text.
+
+The initial context size for the system is set by the parameter
+"--context-size".
+
+The number of output symbols to generate starting from a given
+context is set by the parameter "--sample-size". This variable also
+defines the size of the individual patch.
+
+For the N-token reference text that is split into M patches with the
+system's context size C it takes M*preload + (N-C)*generation time.
+
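+For example (illustrative figures), a 16,384-token reference text with
+--context-size=1024 and --sample-size=512 is split into
+M = ceil((16384 - 1024 - 1) / 512) = 30 patches, which matches the
+my_n_patches computation in main() below.
+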
+Quick correctness validation tips:
+
+Running llama-2-7b model
+(
+ ./vllm/examples/P3L.py
+ --model=meta-llama/Llama-2-7b-chat-hf
+ --context-size=1024
+ --sample-size=512
+)
+should result in PPL ~ 6.524227946419175
+
+Running llama-2-7b model
+(
+ ./vllm/examples/P3L.py
+ --model=meta-llama/Llama-2-7b-chat-hf
+ --context-size=1024
+ --sample-size=512
+ --patch-size=1
+)
+should result in PPL ~ 3.8968611189957523
+
+Running the script with multiple batches is possible
+by specifying the --batch-size parameter.
+
+"""
+
+import argparse
+import dataclasses
+import datetime
+import json
+import math
+import os
+import tempfile
+
+from huggingface_hub import hf_hub_download
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+
+def get_wikitext2_text(tokenizer):
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ hf_hub_download(
+ repo_id="alexei-v-ivanov-amd/wiki",
+ repo_type="dataset",
+ filename="wiki.test.raw",
+ local_dir=tmpdirname,
+ )
+ with open(os.path.join(tmpdirname, "wiki.test.raw")) as f:
+ test_text = "\n".join(line.strip() for line in f)
+ test_enc = tokenizer(test_text)
+
+ return test_enc, test_text
+
+
+def vllm_init(args):
+ engine_args = EngineArgs.from_cli_args(args)
+ llm = LLM(**dataclasses.asdict(engine_args))
+
+ sampling_params = SamplingParams(
+ n=1,
+ temperature=0.0,
+ top_p=1,
+ ignore_eos=True,
+ ppl_measurement=True,
+ future_context=[],
+ prompt_logprobs=1,
+ logprobs=1,
+ presence_penalty=0.0,
+ )
+
+ return llm, sampling_params
+
+
+def vllm_predict(CONT, llm, sampl_par):
+ result = llm.generate(prompt_token_ids=CONT, sampling_params=sampl_par)
+ return result
+
+
+def main(args: argparse.Namespace):
+ MESSAGE = f"Initialising @ {datetime.datetime.now()}"
+ logger.info(MESSAGE)
+ print(MESSAGE)
+ my_ppl = 0.0
+
+ logger.info("Initializing the engine.")
+ my_llm, my_sampl_par = vllm_init(args)
+ my_tokenizer = my_llm.llm_engine.tokenizer.tokenizer
+ logger.info(my_sampl_par)
+ logger.info("Initialized the engine.")
+
+ my_n_samples = args.sample_size
+
+ if (
+ args.context_size + my_n_samples
+ ) > my_llm.llm_engine.model_config.max_model_len:
+ MESSAGE = (
+ ""
+ "Error! The total number of tokens:\n"
+ f" prefix ({args.context_size}) + "
+ f"to be generated ({my_n_samples})"
+ f" can't be bigger than the model limit "
+ f"({my_llm.llm_engine.model_config.max_model_len})."
+ )
+ logger.info(MESSAGE)
+ print(MESSAGE)
+ return
+
+ my_test_enc, my_test_text = get_wikitext2_text(my_tokenizer)
+ logger.info("Loaded the test data.")
+
+ my_n_patches = math.ceil(
+ (len(my_test_enc["input_ids"]) - args.context_size - 1) / my_n_samples
+ )
+ if args.patch_size is not None:
+ my_n_patches = args.patch_size
+
+ num_tokens_generated = 0
+ starting_time = datetime.datetime.now()
+ MESSAGE = (
+ f"Starting generation @ {starting_time}\n"
+ " Have the test sample of "
+ f"{len(my_test_enc['input_ids'])} tokens"
+ f" will try to process {my_n_patches} patche(s),"
+ f" generating {my_n_samples} tokens in each patch"
+ f" from the initial context of {args.context_size} tokens."
+ )
+
+ logger.info(MESSAGE)
+ print(MESSAGE)
+
+ my_batchsize = args.batch_size
+
+ for c in range(0, my_n_patches, my_batchsize):
+ CONTEXT = []
+ my_sampl_par.future_context = []
+ my_sampl_par.cntr = []
+
+ for b in range(my_batchsize):
+ if (c + b) < my_n_patches:
+ upper_boundary = min(
+ (c + b + 1) * my_n_samples + args.context_size,
+ len(my_test_enc["input_ids"]),
+ )
+ CONTEXT.append(
+ my_test_enc["input_ids"][
+ (c + b) * my_n_samples : (c + b) * my_n_samples
+ + args.context_size
+ ]
+ )
+
+ my_sampl_par.future_context.append(
+ my_test_enc["input_ids"][
+ (c + b) * my_n_samples + args.context_size : upper_boundary
+ ]
+ )
+
+ my_sampl_par.cntr.append(c + b)
+
+ my_sampl_par.max_tokens = max(
+ len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT))
+ )
+
+ LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par)
+ for b in range(len(CONTEXT)):
+ num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids)
+ my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob
+
+ if num_tokens_generated < my_n_samples * len(CONTEXT):
+ MESSAGE = (
+ f"Warning: The number of generated tokens is"
+ f"less than requested ({num_tokens_generated}"
+ f" < {my_n_samples * len(CONTEXT)})."
+ )
+ logger.info(MESSAGE)
+ print(MESSAGE)
+
+ MESSAGE = (
+ f"Iterations {c + 1} through {c + len(CONTEXT)}"
+ f" of {my_n_patches} Intermediate "
+ "Estimates:\n"
+ f"\tCross-entropy_intermediate={my_ppl / num_tokens_generated}\n"
+ f"\tPerplexity_intermediate="
+ f"{math.exp(my_ppl / num_tokens_generated)}"
+ )
+
+ logger.info(MESSAGE)
+ print(MESSAGE)
+
+ ending_time = datetime.datetime.now()
+ MESSAGE = (
+ f"Done @ {ending_time} after processing for"
+ f" {ending_time - starting_time}"
+ f" generated {num_tokens_generated} tokens."
+ )
+
+ logger.info(MESSAGE)
+ print(MESSAGE)
+
+ MESSAGE = (
+ f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy="
+ f"{my_ppl / num_tokens_generated}"
+ f"\n\tPPL={math.exp(my_ppl / num_tokens_generated)}"
+ )
+
+ if args.output_json:
+ results = {
+ "integral_cross_entropy": my_ppl,
+ "average_cross_entropy": my_ppl / num_tokens_generated,
+ "ppl": math.exp(my_ppl / num_tokens_generated),
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+
+ logger.info(MESSAGE)
+ print(MESSAGE)
+ return
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+ description="Measure the PPPL (P3L) score of a given model."
+ )
+ parser.add_argument("--context-size", type=int, default=4096)
+ parser.add_argument("--sample-size", type=int, default=512)
+ parser.add_argument("--batch-size", type=int, default=1)
+ parser.add_argument("--patch-size", type=int, default=None)
+ parser.add_argument(
+ "--output-json",
+ type=str,
+ default=None,
+ help="Path to save the latency results in JSON format.",
+ )
+
+ parser = EngineArgs.add_cli_args(parser)
+ args = parser.parse_args()
+
+ main(args)
diff --git a/benchmarks/P3L_mling.py b/benchmarks/P3L_mling.py
new file mode 100755
index 000000000000..7055745e601e
--- /dev/null
+++ b/benchmarks/P3L_mling.py
@@ -0,0 +1,302 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+*MULTILINGUAL* Patch-Perplexity (P3L)
+
+This is a script that produces a realistic PPL measurement
+for the quantized KV cache system by processing a sequence of
+non-overlapping patches of the reference text. Generation of the
+consecutive symbols in each patch is governed (forced)
+by the reference text.
+
+The initial context size for the system is set by the parameter
+"--context-size".
+
+The number of output symbols to generate starting from a given
+context is set by the parameter "--sample-size". This variable also
+defines the size of the individual patch.
+
+For the N-token reference text that is split into M patches with the
+system's context size C it takes M*preload + (N-C)*generation time.
+
+Quick correctness validation tips:
+
+Running DeepSeek-V2 model
+(
+ ./vllm/examples/P3L_mling.py
+ --model=meta-llama/Llama-2-7b-chat-hf
+ --context-size=1024
+ --sample-size=512
+)
+
+should result in PPL ~ 8.42927
+
+Running DeepSeek-V2 model
+(
+ ./vllm/examples/P3L_mling.py
+ --model=meta-llama/Llama-2-7b-chat-hf
+ --context-size=1024
+ --sample-size=512
+ --patch-size=1
+ --lang-script="cmn_Hant"
+)
+should result in PPL ~ 2.67962
+
+The multi-linguality is implemented through the additional
+key "--lang-script", which defaults to English in Latin
+scripture ("eng_Latn").
+
+Please refer to
+
+https://confluence.amd.com/display/MLSE/Multi-Lingual+P3L+Test
+
+for the complete set of possible language-script choices.
+
+Running the script with multiple batches is possible
+by specifying the --batch-size parameter.
+
+"""
+
+import argparse
+import dataclasses
+import datetime
+import json
+import math
+import os
+import tempfile
+
+import pandas
+from huggingface_hub import hf_hub_download
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+
+def get_wikitext2_text(tokenizer):
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ hf_hub_download(
+ repo_id="alexei-v-ivanov-amd/wiki",
+ repo_type="dataset",
+ filename="wiki.test.raw",
+ local_dir=tmpdirname,
+ )
+ with open(os.path.join(tmpdirname, "wiki.test.raw")) as f:
+ test_text = "\n".join(line.strip() for line in f)
+ test_enc = tokenizer(test_text)
+
+ return test_enc, test_text
+
+
+def get_flores_plus_text(tokenizer, lng_script):
+ hf_hub_download(
+ repo_id="alexei-v-ivanov-amd/flores_plus",
+ repo_type="dataset",
+ filename=lng_script + ".parquet",
+ local_dir="./",
+ )
+
+ df = pandas.read_parquet("./" + lng_script + ".parquet")
+ test_text = "\n\n".join(line.strip() for line in df["text"])
+ test_enc = tokenizer(test_text)
+
+ os.remove("./" + lng_script + ".parquet")
+
+ return test_enc, test_text
+
+
+def vllm_init(args):
+ engine_args = EngineArgs.from_cli_args(args)
+ llm = LLM(**dataclasses.asdict(engine_args))
+
+ sampling_params = SamplingParams(
+ n=1,
+ temperature=0.0,
+ top_p=1,
+ ignore_eos=True,
+ ppl_measurement=True,
+ future_context=[],
+ prompt_logprobs=1,
+ logprobs=1,
+ presence_penalty=0.0,
+ )
+
+ return llm, sampling_params
+
+
+def vllm_predict(CONT, llm, sampl_par):
+ result = llm.generate(prompt_token_ids=CONT, sampling_params=sampl_par)
+ return result
+
+
+def main(args: argparse.Namespace):
+ MESSAGE = f"Initialising @ {datetime.datetime.now()}"
+ logger.info(MESSAGE)
+ print(MESSAGE)
+ my_ppl = 0.0
+
+ logger.info("Initializing the engine.")
+ my_llm, my_sampl_par = vllm_init(args)
+ my_tokenizer = my_llm.llm_engine.tokenizer.tokenizer
+ logger.info(my_sampl_par)
+ logger.info("Initialized the engine.")
+
+ my_n_samples = args.sample_size
+ my_lang_script = args.lang_script
+
+ if (
+ args.context_size + my_n_samples
+ ) > my_llm.llm_engine.model_config.max_model_len:
+ MESSAGE = (
+ ""
+ "Error! The total number of tokens:\n"
+ f" prefix ({args.context_size}) + "
+ f"to be generated ({my_n_samples})"
+ f" can't be bigger than the model limit "
+ f"({my_llm.llm_engine.model_config.max_model_len})."
+ )
+ logger.info(MESSAGE)
+ print(MESSAGE)
+ return
+
+ my_test_enc, my_test_text = get_flores_plus_text(my_tokenizer, my_lang_script)
+
+ logger.info("Loaded the test data.")
+
+ my_n_patches = math.ceil(
+ (len(my_test_enc["input_ids"]) - args.context_size - 1) / my_n_samples
+ )
+ if args.patch_size is not None:
+ my_n_patches = args.patch_size
+
+ num_tokens_generated = 0
+ starting_time = datetime.datetime.now()
+ MESSAGE = (
+ f"Starting generation @ {starting_time}\n"
+ " Have the test sample of "
+ f"{len(my_test_enc['input_ids'])} tokens"
+ f" will try to process {my_n_patches} patche(s),"
+ f" generating {my_n_samples} tokens in each patch"
+ f" from the initial context of {args.context_size} tokens."
+ )
+
+ logger.info(MESSAGE)
+ print(MESSAGE)
+
+ my_batchsize = args.batch_size
+
+ for c in range(0, my_n_patches, my_batchsize):
+ CONTEXT = []
+ my_sampl_par.future_context = []
+ my_sampl_par.cntr = []
+
+ for b in range(my_batchsize):
+ if (c + b) < my_n_patches:
+ upper_boundary = min(
+ (c + b + 1) * my_n_samples + args.context_size,
+ len(my_test_enc["input_ids"]),
+ )
+ CONTEXT.append(
+ my_test_enc["input_ids"][
+ (c + b) * my_n_samples : (c + b) * my_n_samples
+ + args.context_size
+ ]
+ )
+
+ my_sampl_par.future_context.append(
+ my_test_enc["input_ids"][
+ (c + b) * my_n_samples + args.context_size : upper_boundary
+ ]
+ )
+
+ my_sampl_par.cntr.append(c + b)
+
+ my_sampl_par.max_tokens = max(
+ len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT))
+ )
+
+ LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par)
+ for b in range(len(CONTEXT)):
+ num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids)
+ my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob
+
+ if num_tokens_generated < my_n_samples * len(CONTEXT):
+ MESSAGE = (
+ f"Warning: The number of generated tokens is"
+ f"less than requested ({num_tokens_generated}"
+ f" < {my_n_samples * len(CONTEXT)})."
+ )
+ logger.info(MESSAGE)
+ print(MESSAGE)
+
+ MESSAGE = (
+ f"Iterations {c + 1} through {c + len(CONTEXT)}"
+ f" of {my_n_patches} Intermediate "
+ "Estimates:\n"
+ f"\tCross-entropy_intermediate={my_ppl / num_tokens_generated}\n"
+ f"\tPerplexity_intermediate="
+ f"{math.exp(my_ppl / num_tokens_generated)}"
+ )
+
+ logger.info(MESSAGE)
+ print(MESSAGE)
+
+ ending_time = datetime.datetime.now()
+ MESSAGE = (
+ f"Done @ {ending_time} after processing for"
+ f" {ending_time - starting_time}"
+ f" generated {num_tokens_generated} tokens."
+ )
+
+ logger.info(MESSAGE)
+ print(MESSAGE)
+
+ MESSAGE = (
+ f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy="
+ f"{my_ppl / num_tokens_generated}"
+ f"\n\tPPL={math.exp(my_ppl / num_tokens_generated)}"
+ )
+
+ if args.output_json:
+ results = {
+ "integral_cross_entropy": my_ppl,
+ "average_cross_entropy": my_ppl / num_tokens_generated,
+ "ppl": math.exp(my_ppl / num_tokens_generated),
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+
+ logger.info(MESSAGE)
+ print(MESSAGE)
+ return
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+ description="Measure the PPPL (P3L) score of a given model."
+ )
+ parser.add_argument(
+ "--data",
+ type=str,
+ default="./wikitext/wikitext-2-v1/test-00000-of-00001.parquet",
+ )
+ parser.add_argument("--context-size", type=int, default=4096)
+ parser.add_argument("--sample-size", type=int, default=512)
+ parser.add_argument("--batch-size", type=int, default=1)
+ parser.add_argument("--patch-size", type=int, default=None)
+ parser.add_argument("--lang-script", type=str, default="eng_Latn")
+ parser.add_argument(
+ "--output-json",
+ type=str,
+ default=None,
+ help="Path to save the latency results in JSON format.",
+ )
+
+ parser = EngineArgs.add_cli_args(parser)
+ args = parser.parse_args()
+
+ main(args)
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 957c2f988051..269a4d51ec2e 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,807 +1,20 @@
-# Benchmarking vLLM
+# Benchmarks
-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
+This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
-## Dataset Overview
+## Contents
-
-| Dataset | Online | Offline | Data Path |
-|---------|--------|---------|-----------|
-| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
-| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json` (images must be downloaded separately, e.g. COCO's 2017 Train images: `wget http://images.cocodataset.org/zips/train2017.zip`) |
-| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
-| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
-| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
-| Random | ✅ | ✅ | synthetic |
-| RandomMultiModal (Image/Video) | 🟡 | 🚧 | synthetic |
-| Prefix Repetition | ✅ | ✅ | synthetic |
-| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
-| HuggingFace-InstructCoder | ✅ | ✅ | `likaixin/InstructCoder` |
-| HuggingFace-AIMO | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
-| HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
-| Custom | ✅ | ✅ | Local file: `data.jsonl` |
-
+- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
+- **Throughput benchmarks**: Scripts for testing offline batch inference performance
+- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
+- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
-✅: supported
+## Usage
-🟡: Partial support
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
-🚧: to be supported
+For full CLI reference see:
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
-For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
-
-```bash
---dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
-```
-
-## 🚀 Example - Online Benchmark
-
-First start serving your model
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-Then run the benchmarking script
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench serve \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --endpoint /v1/completions \
- --dataset-name sharegpt \
- --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-============ Serving Benchmark Result ============
-Successful requests: 10
-Benchmark duration (s): 5.78
-Total input tokens: 1369
-Total generated tokens: 2212
-Request throughput (req/s): 1.73
-Output token throughput (tok/s): 382.89
-Total Token throughput (tok/s): 619.85
----------------Time to First Token----------------
-Mean TTFT (ms): 71.54
-Median TTFT (ms): 73.88
-P99 TTFT (ms): 79.49
------Time per Output Token (excl. 1st token)------
-Mean TPOT (ms): 7.91
-Median TPOT (ms): 7.96
-P99 TPOT (ms): 8.03
----------------Inter-token Latency----------------
-Mean ITL (ms): 7.74
-Median ITL (ms): 7.70
-P99 ITL (ms): 8.39
-==================================================
-```
-
-### Custom Dataset
-
-If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
-
-```json
-{"prompt": "What is the capital of India?"}
-{"prompt": "What is the capital of Iran?"}
-{"prompt": "What is the capital of China?"}
-```
-
-```bash
-# start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
-```
-
-```bash
-# run benchmarking script
-vllm bench serve --port 9001 --save-result --save-detailed \
- --backend vllm \
- --model meta-llama/Llama-3.1-8B-Instruct \
- --endpoint /v1/completions \
- --dataset-name custom \
- --dataset-path \
- --custom-skip-chat-template \
- --num-prompts 80 \
- --max-concurrency 1 \
- --temperature=0.3 \
- --top-p=0.75 \
- --result-dir "./log/"
-```
-
-You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-# need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path lmarena-ai/VisionArena-Chat \
- --hf-split train \
- --num-prompts 1000
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
- --speculative-config $'{"method": "ngram",
- "num_speculative_tokens": 5, "prompt_lookup_max": 5,
- "prompt_lookup_min": 2}'
-```
-
-``` bash
-vllm bench serve \
- --model meta-llama/Meta-Llama-3-8B-Instruct \
- --dataset-name hf \
- --dataset-path likaixin/InstructCoder \
- --num-prompts 2048
-```
-
-### Other HuggingFaceDataset Examples
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path lmms-lab/LLaVA-OneVision-Data \
- --hf-split train \
- --hf-subset "chart2text(cauldron)" \
- --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --endpoint-type openai-chat \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name hf \
- --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
- --hf-split train \
- --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-``` bash
-vllm bench serve \
- --model Qwen/QwQ-32B \
- --dataset-name hf \
- --dataset-path AI-MO/aimo-validation-aime \
- --num-prompts 10 \
- --seed 42
-```
-
-`philschmid/mt-bench`:
-
-``` bash
-vllm bench serve \
- --model Qwen/QwQ-32B \
- --dataset-name hf \
- --dataset-path philschmid/mt-bench \
- --num-prompts 80
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-vllm bench serve \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --endpoint /v1/completions \
- --dataset-name sharegpt \
- --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --top-k 10 \
- --top-p 0.9 \
- --temperature 0.5 \
- --num-prompts 10
-```
-
-### Running With Ramp-Up Request Rate
-
-The benchmark tool also supports ramping up the request rate over the
-duration of the benchmark run. This can be useful for stress testing the
-server or finding the maximum throughput that it can handle, given some latency budget.
-
-Two ramp-up strategies are supported:
-
-- `linear`: Increases the request rate linearly from a start value to an end value.
-- `exponential`: Increases the request rate exponentially.
-
-The following arguments can be used to control the ramp-up:
-
-- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
-- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
-- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
-
-
-
-## 📈 Example - Offline Throughput Benchmark
-
-```bash
-vllm bench throughput \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset-name sonnet \
- --dataset-path vllm/benchmarks/sonnet.txt \
- --num-prompts 10
-```
-
-If successful, you will see the following output
-
-```text
-Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
-Total num prompt tokens: 5014
-Total num output tokens: 1500
-```
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path lmarena-ai/VisionArena-Chat \
- --num-prompts 1000 \
- --hf-split train
-```
-
-The `num prompt tokens` now includes image token counts
-
-```text
-Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
-Total num prompt tokens: 14527
-Total num output tokens: 1280
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-vllm bench throughput \
- --dataset-name=hf \
- --dataset-path=likaixin/InstructCoder \
- --model=meta-llama/Meta-Llama-3-8B-Instruct \
- --input-len=1000 \
- --output-len=100 \
- --num-prompts=2048 \
- --async-engine \
- --speculative-config $'{"method": "ngram",
- "num_speculative_tokens": 5, "prompt_lookup_max": 5,
- "prompt_lookup_min": 2}'
-```
-
-```text
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens: 261136
-Total num output tokens: 204800
-```
-
-### Other HuggingFaceDataset Examples
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path lmms-lab/LLaVA-OneVision-Data \
- --hf-split train \
- --hf-subset "chart2text(cauldron)" \
- --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench throughput \
- --model Qwen/Qwen2-VL-7B-Instruct \
- --backend vllm-chat \
- --dataset-name hf \
- --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
- --hf-split train \
- --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-```bash
-vllm bench throughput \
- --model Qwen/QwQ-32B \
- --backend vllm \
- --dataset-name hf \
- --dataset-path AI-MO/aimo-validation-aime \
- --hf-split train \
- --num-prompts 10
-```
-
-Benchmark with LoRA adapters:
-
-``` bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench throughput \
- --model meta-llama/Llama-2-7b-hf \
- --backend vllm \
- --dataset_path /ShareGPT_V3_unfiltered_cleaned_split.json \
- --dataset_name sharegpt \
- --num-prompts 10 \
- --max-loras 2 \
- --max-lora-rank 8 \
- --enable-lora \
- --lora-path yard1/llama-2-7b-sql-lora-test
- ```
-
-
-
-## 🛠️ Example - Structured Output Benchmark
-
-Benchmark the performance of structured output generation (JSON, grammar, regex).
-
-### Server Setup
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-### JSON Schema Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset json \
- --structured-output-ratio 1.0 \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Grammar-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset grammar \
- --structure-type grammar \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Regex-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset regex \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### Choice-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset choice \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-### XGrammar Benchmark Dataset
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
- --backend vllm \
- --model NousResearch/Hermes-3-Llama-3.1-8B \
- --dataset xgrammar_bench \
- --request-rate 10 \
- --num-prompts 1000
-```
-
-
-
-## 📚 Example - Long Document QA Benchmark
-
-Benchmark the performance of long document question-answering with prefix caching.
-
-### Basic Long Document QA Test
-
-```bash
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 16 \
- --document-length 2000 \
- --output-len 50 \
- --repeat-count 5
-```
-
-### Different Repeat Modes
-
-```bash
-# Random mode (default) - shuffle prompts randomly
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode random
-
-# Tile mode - repeat entire prompt list in sequence
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode tile
-
-# Interleave mode - repeat each prompt consecutively
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-documents 8 \
- --document-length 3000 \
- --repeat-count 3 \
- --repeat-mode interleave
-```
-
-
-
-## 🗂️ Example - Prefix Caching Benchmark
-
-Benchmark the efficiency of automatic prefix caching.
-
-### Fixed Prompt with Prefix Caching
-
-```bash
-python3 benchmarks/benchmark_prefix_caching.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --enable-prefix-caching \
- --num-prompts 1 \
- --repeat-count 100 \
- --input-length-range 128:256
-```
-
-### ShareGPT Dataset with Prefix Caching
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-python3 benchmarks/benchmark_prefix_caching.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
- --enable-prefix-caching \
- --num-prompts 20 \
- --repeat-count 5 \
- --input-length-range 128:256
-```
-
-### Prefix Repetition Dataset
-
-```bash
-vllm bench serve \
- --backend openai \
- --model meta-llama/Llama-2-7b-chat-hf \
- --dataset-name prefix_repetition \
- --num-prompts 100 \
- --prefix-repetition-prefix-len 512 \
- --prefix-repetition-suffix-len 128 \
- --prefix-repetition-num-prefixes 5 \
- --prefix-repetition-output-len 128
-```
-
-
-
-## ⚡ Example - Request Prioritization Benchmark
-
-Benchmark the performance of request prioritization in vLLM.
-
-### Basic Prioritization Test
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --input-len 128 \
- --output-len 64 \
- --num-prompts 100 \
- --scheduling-policy priority
-```
-
-### Multiple Sequences per Prompt
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
- --model meta-llama/Llama-2-7b-chat-hf \
- --input-len 128 \
- --output-len 64 \
- --num-prompts 100 \
- --scheduling-policy priority \
- --n 2
-```
-
-
-
-## 👁️ Example - Multi-Modal Benchmark
-
-Benchmark the performance of multi-modal requests in vLLM.
-
-### Images (ShareGPT4V)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dtype bfloat16 \
- --limit-mm-per-prompt '{"image": 1}' \
- --allowed-local-media-path /path/to/sharegpt4v/images
-```
-
-Send requests with images:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dataset-name sharegpt \
- --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
- --num-prompts 100 \
- --save-result \
- --result-dir ~/vllm_benchmark_results \
- --save-detailed \
- --endpoint /v1/chat/completion
-```
-
-### Videos (ShareGPT4Video)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dtype bfloat16 \
- --limit-mm-per-prompt '{"video": 1}' \
- --allowed-local-media-path /path/to/sharegpt4video/videos
-```
-
-Send requests with videos:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --model Qwen/Qwen2.5-VL-7B-Instruct \
- --dataset-name sharegpt \
- --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
- --num-prompts 100 \
- --save-result \
- --result-dir ~/vllm_benchmark_results \
- --save-detailed \
- --endpoint /v1/chat/completion
-```
-
-### Synthetic Random Images (random-mm)
-
-Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
-
-Notes:
-
-- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
-- Video sampling is not yet implemented.
-
-Start the server (example):
-
-```bash
-vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
- --dtype bfloat16 \
- --max-model-len 16384 \
- --limit-mm-per-prompt '{"image": 3, "video": 0}' \
- --mm-processor-kwargs max_pixels=1003520
-```
-
-Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
-
-Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
-
-```bash
-vllm bench serve \
- --backend openai-chat \
- --model Qwen/Qwen2.5-VL-3B-Instruct \
- --endpoint /v1/chat/completions \
- --dataset-name random-mm \
- --num-prompts 100 \
- --max-concurrency 10 \
- --random-prefix-len 25 \
- --random-input-len 300 \
- --random-output-len 40 \
- --random-range-ratio 0.2 \
- --random-mm-base-items-per-request 2 \
- --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
- --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
- --request-rate inf \
- --ignore-eos \
- --seed 42
-```
-
-The number of items per request can be controlled by passing multiple image buckets:
-
-```bash
- --random-mm-base-items-per-request 2 \
- --random-mm-num-mm-items-range-ratio 0.5 \
- --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
- --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
-```
-
-Flags specific to `random-mm`:
-
-- `--random-mm-base-items-per-request`: base number of multimodal items per request.
-- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
-- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
-- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
-
-Behavioral notes:
-
-- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
-
-How sampling works:
-
-- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
-- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
-- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
-This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
-- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
-
-
+-
+-
+-
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index 3aa988aac254..d1bdb4c43f10 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -149,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
+
+## Batched `auto_tune`
+
+The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
+
+### Prerequisites
+
+- **jq**: This script requires `jq` to parse the JSON configuration file.
+- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
+
+### How to Run
+
+1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
+
+2. **Execute the script**:
+
+ ```bash
+ bash batch_auto_tune.sh <config_file> [gcs_upload_path]
+ ```
+
+ - `<config_file>`: **Required.** Path to your JSON configuration file.
+ - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
+
+### Configuration File
+
+The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
+
+Here is an example `runs_config.json` with two benchmark configurations (`system` can be `TPU` or `GPU`):
+
+```json
+[
+ {
+ "base": "/home/user",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "system": "TPU", # OR GPU
+ "tp": 8,
+ "input_len": 128,
+ "output_len": 2048,
+ "max_model_len": 2300,
+ "num_seqs_list": "128 256",
+ "num_batched_tokens_list": "8192 16384"
+ },
+ {
+ "base": "/home/user",
+ "model": "meta-llama/Llama-3.1-70B-Instruct",
+ "system": "TPU", # OR GPU
+ "tp": 8,
+ "input_len": 4000,
+ "output_len": 16,
+ "max_model_len": 4096,
+ "num_seqs_list": "64 128",
+ "num_batched_tokens_list": "4096 8192",
+ "max_latency_allowed_ms": 500
+ }
+]
+```
+
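+For reference, each key in a run object is uppercased and exported as an environment variable before `auto_tune.sh` is invoked, so the first entry above is roughly equivalent to the following manual invocation (a sketch of what `batch_auto_tune.sh` does internally):
+
+```bash
+BASE=/home/user MODEL=meta-llama/Llama-3.1-8B-Instruct SYSTEM=TPU TP=8 \
+  INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 \
+  NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="8192 16384" \
+  bash auto_tune.sh
+```
+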
+### Output
+
+The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
+
+- `run_id`: A unique identifier for the run, derived from the timestamp.
+- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
+- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
+- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
+
+A summary of successful and failed runs is also printed to the console upon completion.
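+
+Because these fields are written back into the configuration file, a quick way to review all runs afterwards is with `jq` (a convenience sketch, assuming the file is named `runs_config.json` as in the example above):
+
+```bash
+# Print one tab-separated line per run: model, status, run_id.
+jq -r '.[] | [.model, .status, .run_id] | @tsv' runs_config.json
+```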
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index ed3679b66f80..56b721cbb402 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -74,7 +74,7 @@ start_server() {
local vllm_log=$4
local profile_dir=$5
- pkill -if vllm
+ pkill -if "vllm serve" || true
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
@@ -96,17 +96,22 @@ start_server() {
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
- VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+ VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
- VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+ VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
+ local server_pid=$!
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
+ # Check whether the server process is still alive. Since it is our own child
+ # process, we always have permission to signal it, so a failure here means it has exited.
+ kill -0 $server_pid 2> /dev/null || break
+
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
@@ -118,7 +123,7 @@ start_server() {
done
if (( ! server_started )); then
- echo "server did not start within 10 minutes. Please check server log at $vllm_log".
+ echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
return 1
else
return 0
@@ -134,7 +139,7 @@ run_benchmark() {
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
- pkill -if vllm
+ pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
@@ -227,7 +232,7 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
- pkill -if vllm
+ pkill -if "vllm serve" || true
sleep 10
echo "===================="
return 0
@@ -303,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
-pkill -if vllm
+pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
diff --git a/benchmarks/auto_tune/batch_auto_tune.sh b/benchmarks/auto_tune/batch_auto_tune.sh
new file mode 100755
index 000000000000..57ef20daf6b7
--- /dev/null
+++ b/benchmarks/auto_tune/batch_auto_tune.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+INPUT_JSON="$1"
+GCS_PATH="$2" # Optional GCS path for uploading results for each run
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
+
+if [[ -z "$INPUT_JSON" ]]; then
+ echo "Error: Input JSON file not provided."
+ echo "Usage: $0 [gcs_upload_path]"
+ exit 1
+fi
+
+if [[ ! -f "$INPUT_JSON" ]]; then
+ echo "Error: File not found at '$INPUT_JSON'"
+ exit 1
+fi
+
+if ! command -v jq &> /dev/null; then
+ echo "Error: 'jq' command not found. Please install jq to process the JSON input."
+ exit 1
+fi
+
+if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
+ echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
+ exit 1
+fi
+
+SUCCESS_COUNT=0
+FAILURE_COUNT=0
+FAILED_RUNS=()
+SCRIPT_START_TIME=$(date +%s)
+
+json_content=$(cat "$INPUT_JSON")
+if ! num_runs=$(echo "$json_content" | jq 'length'); then
+ echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
+ exit 1
+fi
+
+echo "Found $num_runs benchmark configurations in $INPUT_JSON."
+echo "Starting benchmark runs..."
+echo "--------------------------------------------------"
+
+for i in $(seq 0 $(($num_runs - 1))); do
+ run_object=$(echo "$json_content" | jq ".[$i]")
+
+ RUN_START_TIME=$(date +%s)
+ ENV_VARS_ARRAY=()
+ # Dynamically create env vars from the JSON object's keys
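+  # e.g. a hypothetical key "max_num_seqs" would become MAX_NUM_SEQS=<value> in auto_tune.sh's environment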
+ for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
+ value=$(echo "$run_object" | jq -r ".$key")
+ var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
+ ENV_VARS_ARRAY+=("${var_name}=${value}")
+ done
+
+ echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
+
+ # Execute auto_tune.sh and capture output
+ RUN_OUTPUT_FILE=$(mktemp)
+ if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
+ STATUS="SUCCESS"
+ ((SUCCESS_COUNT++))
+ else
+ STATUS="FAILURE"
+ ((FAILURE_COUNT++))
+ FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
+ fi
+
+ RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
+ rm "$RUN_OUTPUT_FILE"
+
+ # Parse results and optionally upload them to GCS
+ RUN_ID=""
+ RESULTS=""
+ GCS_RESULTS_URL=""
+ if [[ "$STATUS" == "SUCCESS" ]]; then
+ RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
+
+ if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
+ RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
+ RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
+ RESULTS=$(cat "$RESULT_FILE_PATH")
+
+ if [[ -n "$GCS_PATH" ]]; then
+ GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
+ echo "Uploading results to GCS..."
+ if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
+ echo "GCS upload successful."
+ else
+ echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
+ fi
+ fi
+ else
+ echo "Warning: Could not find result file for a successful run."
+ STATUS="WARNING_NO_RESULT_FILE"
+ fi
+ fi
+
+ # Add the results back into the JSON object for this run
+ json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
+ '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
+
+ RUN_END_TIME=$(date +%s)
+ echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
+ echo "--------------------------------------------------"
+
+ # Save intermediate progress back to the file
+ echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
+
+done
+
+SCRIPT_END_TIME=$(date +%s)
+echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
+echo
+echo "====================== SUMMARY ======================"
+echo "Successful runs: $SUCCESS_COUNT"
+echo "Failed runs: $FAILURE_COUNT"
+echo "==================================================="
+
+if [[ $FAILURE_COUNT -gt 0 ]]; then
+ echo "Details of failed runs (see JSON file for full parameters):"
+ for failed in "${FAILED_RUNS[@]}"; do
+ echo " - $failed"
+ done
+fi
+
+echo "Updated results have been saved to '$INPUT_JSON'."
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index ba7c733be0b2..4021fede7215 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -8,7 +8,6 @@
import time
import traceback
from dataclasses import dataclass, field
-from typing import Optional, Union
import aiohttp
import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
prompt_len: int
output_len: int
model: str
- model_name: Optional[str] = None
- logprobs: Optional[int] = None
- extra_body: Optional[dict] = None
- multi_modal_content: Optional[dict | list[dict]] = None
+ model_name: str | None = None
+ logprobs: int | None = None
+ extra_body: dict | None = None
+ multi_modal_content: dict | list[dict] | None = None
ignore_eos: bool = False
- language: Optional[str] = None
- request_id: Optional[str] = None
+ language: str | None = None
+ request_id: str | None = None
@dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:
async def async_request_tgi(
request_func_input: RequestFuncInput,
- pbar: Optional[tqdm] = None,
+ pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
- pbar: Optional[tqdm] = None,
+ pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
- pbar: Optional[tqdm] = None,
+ pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
- pbar: Optional[tqdm] = None,
+ pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(
async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
- pbar: Optional[tqdm] = None,
+ pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
- pbar: Optional[tqdm] = None,
+ pbar: tqdm | None = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path
):
diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
index eae8d9927ea3..5434f8b6a4e4 100644
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
+from benchmark_utils import TimeCollector
from tabulate import tabulate
-from benchmark_utils import TimeCollector
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
deleted file mode 100644
index 64ffa62c04d8..000000000000
--- a/benchmarks/benchmark_dataset.py
+++ /dev/null
@@ -1,1288 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This module defines a framework for sampling benchmark requests from various
-datasets. Each dataset subclass of BenchmarkDataset must implement sample
-generation. Supported dataset types include:
- - ShareGPT
- - Random (synthetic)
- - Sonnet
- - BurstGPT
- - HuggingFace
- - VisionArena
-"""
-
-import base64
-import io
-import json
-import logging
-import random
-from abc import ABC, abstractmethod
-from collections.abc import Mapping
-from copy import deepcopy
-from dataclasses import dataclass
-from functools import cache
-from io import BytesIO
-from typing import Any, Callable, Optional, Union
-
-import numpy as np
-import pandas as pd
-from datasets import load_dataset
-from PIL import Image
-from transformers import PreTrainedTokenizerBase
-
-from vllm.lora.request import LoRARequest
-from vllm.lora.utils import get_adapter_absolute_path
-from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.image import convert_image_mode
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
-
-logger = logging.getLogger(__name__)
-
-# -----------------------------------------------------------------------------
-# Data Classes
-# -----------------------------------------------------------------------------
-
-
-@dataclass
-class SampleRequest:
- """
- Represents a single inference request for benchmarking.
- """
-
- prompt: Union[str, Any]
- prompt_len: int
- expected_output_len: int
- multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
- lora_request: Optional[LoRARequest] = None
- request_id: Optional[str] = None
-
-
-# -----------------------------------------------------------------------------
-# Benchmark Dataset Base Class
-# -----------------------------------------------------------------------------
-
-
-class BenchmarkDataset(ABC):
- DEFAULT_SEED = 0
- IS_MULTIMODAL = False
-
- def __init__(
- self,
- dataset_path: Optional[str] = None,
- random_seed: int = DEFAULT_SEED,
- ) -> None:
- """
- Initialize the BenchmarkDataset with an optional dataset path and random
- seed. Args:
- dataset_path (Optional[str]): Path to the dataset. If None, it
- indicates that a default or random dataset might be used.
- random_seed (int): Seed value for reproducible shuffling or
- sampling. Defaults to DEFAULT_SEED.
- """
- self.dataset_path = dataset_path
- # Set the random seed, ensuring that a None value is replaced with the
- # default seed.
- self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
- self.data = None
-
- def apply_multimodal_chat_transformation(
- self, prompt: str, mm_content: Optional[MultiModalDataDict] = None
- ) -> list[dict]:
- """
- Transform a prompt and optional multimodal content into a chat format.
- This method is used for chat models that expect a specific conversation
- format.
- """
- content = [{"text": prompt, "type": "text"}]
- if mm_content is not None:
- content.append(mm_content)
- return [{"role": "user", "content": content}]
-
- def load_data(self) -> None:
- """
- Load data from the dataset path into self.data.
-
- This method must be overridden by subclasses since the method to load
- data will vary depending on the dataset format and source.
-
- Raises:
- NotImplementedError: If a subclass does not implement this method.
- """
- # TODO (jenniferzhao): add support for downloading data
- raise NotImplementedError("load_data must be implemented in subclasses.")
-
- def get_random_lora_request(
- self,
- tokenizer: PreTrainedTokenizerBase,
- max_loras: Optional[int] = None,
- lora_path: Optional[str] = None,
- ) -> tuple[Optional[LoRARequest], AnyTokenizer]:
- """
- Optionally select a random LoRA request and return its associated
- tokenizer.
-
- This method is used when LoRA parameters are provided. It randomly
- selects a LoRA based on max_loras and retrieves a cached tokenizer for
- that LoRA if available. Otherwise, it returns the base tokenizer.
-
- Args:
- tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
- LoRA is selected. max_loras (Optional[int]): The maximum number of
- LoRAs available. If None, LoRA is not used. lora_path
- (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
- is not used.
-
- Returns:
- tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
- element is a LoRARequest (or None if not applicable) and the second
- element is the tokenizer associated with the LoRA request (or the
- base tokenizer).
- """
- if max_loras is None or lora_path is None:
- return None, tokenizer
-
- # Generate a random LoRA ID in the range [1, max_loras].
- lora_id = random.randint(1, max_loras)
- lora_request = LoRARequest(
- lora_name=str(lora_id),
- lora_int_id=lora_id,
- lora_path=lora_path_on_disk(lora_path),
- )
- if lora_id not in lora_tokenizer_cache:
- lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
- # Return lora_request and the cached tokenizer if available; otherwise,
- # return the base tokenizer
- return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
-
- @abstractmethod
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- request_id_prefix: str = "",
- ) -> list[SampleRequest]:
- """
- Abstract method to generate sample requests from the dataset.
-
- Subclasses must override this method to implement dataset-specific logic
- for generating a list of SampleRequest objects.
-
- Args:
- tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
- for processing the dataset's text.
- num_requests (int): The number of sample requests to generate.
- request_id_prefix (str) The prefix of request_id.
-
- Returns:
- list[SampleRequest]: A list of sample requests generated from the
- dataset.
- """
- raise NotImplementedError("sample must be implemented in subclasses.")
-
- def maybe_oversample_requests(
- self,
- requests: list[SampleRequest],
- num_requests: int,
- request_id_prefix: str = "",
- ) -> None:
- """
- Oversamples the list of requests if its size is less than the desired
- number.
-
- Args:
- requests (List[SampleRequest]): The current list of sampled
- requests.
- num_requests (int): The target number of requests.
- request_id_prefix (str) The prefix of the request ids.
- """
- if len(requests) < num_requests:
- random.seed(self.random_seed)
- additional = deepcopy(
- random.choices(requests, k=num_requests - len(requests))
- )
- for i in range(len(additional)):
- req = additional[i]
- req.request_id = request_id_prefix + str(len(requests) + i)
- requests.extend(additional)
- logger.info("Oversampled requests to reach %d total samples.", num_requests)
-
-
-# -----------------------------------------------------------------------------
-# Utility Functions and Global Caches
-# -----------------------------------------------------------------------------
-
-
-def is_valid_sequence(
- prompt_len: int,
- output_len: int,
- min_len: int = 4,
- max_prompt_len: int = 1024,
- max_total_len: int = 2048,
- skip_min_output_len_check: bool = False,
-) -> bool:
- """
- Validate a sequence based on prompt and output lengths.
-
- Default pruning criteria are copied from the original `sample_hf_requests`
- and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
- from `sample_requests` in benchmark_throughput.py.
- """
- # Check for invalid conditions
- prompt_too_short = prompt_len < min_len
- output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
- prompt_too_long = prompt_len > max_prompt_len
- combined_too_long = (prompt_len + output_len) > max_total_len
-
- # Return True if none of the invalid conditions are met
- return not (
- prompt_too_short or output_too_short or prompt_too_long or combined_too_long
- )
-
-
-@cache
-def lora_path_on_disk(lora_path: str) -> str:
- return get_adapter_absolute_path(lora_path)
-
-
-# Global cache for LoRA tokenizers.
-lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
-
-
-def process_image(image: Any) -> Mapping[str, Any]:
- """
- Process a single image input and return a multimedia content dictionary.
-
- Supports three input types:
-
- 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
- containing raw image data. - Loads the bytes as a PIL.Image.Image.
-
- 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
- a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
- a dictionary with the image as a base64 data URL.
-
- 3. String input: - Treats the string as a URL or local file path. -
- Prepends "file://" if the string doesn't start with "http://" or
- "file://". - Returns a dictionary with the image URL.
-
- Raises:
- ValueError: If the input is not a supported type.
- """
- if isinstance(image, dict) and "bytes" in image:
- image = Image.open(BytesIO(image["bytes"]))
- if isinstance(image, Image.Image):
- image = convert_image_mode(image, "RGB")
- with io.BytesIO() as image_data:
- image.save(image_data, format="JPEG")
- image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
- return {
- "type": "image_url",
- "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
- }
-
- if isinstance(image, str):
- image_url = (
- image if image.startswith(("http://", "file://")) else f"file://{image}"
- )
- return {"type": "image_url", "image_url": {"url": image_url}}
-
- raise ValueError(
- f"Invalid image input {image}. Must be a PIL.Image.Image"
- " or str or dictionary with raw image bytes."
- )
-
-
-def process_video(video: Any) -> Mapping[str, Any]:
- """
- Process a single video input and return a multimedia content dictionary.
-
- Supports the following input types:
-
- 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
- containing raw video data.
-
- 2. String input: - Treats the string as a URL or local file path. -
- Prepends "file://" if the string doesn't start with "http://" or
- "file://". - Returns a dictionary with the image URL.
-
- Raises:
- ValueError: If the input is not a supported type.
- """
- if isinstance(video, dict) and "bytes" in video:
- video_bytes = video["bytes"]
- video_base64 = base64.b64encode(video_bytes).decode("utf-8")
- return {
- "type": "video_url",
- "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
- }
-
- if isinstance(video, str):
- video_url = (
- video if video.startswith(("http://", "file://")) else f"file://{video}"
- )
- return {"type": "video_url", "video_url": {"url": video_url}}
-
- raise ValueError(
- f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
- )
-
-
-# -----------------------------------------------------------------------------
-# Random Dataset Implementation (Synthetic Data)
-# -----------------------------------------------------------------------------
-
-
-class RandomDataset(BenchmarkDataset):
- # Default values copied from benchmark_serving.py for the random dataset.
- DEFAULT_PREFIX_LEN = 0
- DEFAULT_RANGE_RATIO = 0.0
- DEFAULT_INPUT_LEN = 1024
- DEFAULT_OUTPUT_LEN = 128
-
- def __init__(
- self,
- **kwargs,
- ) -> None:
- super().__init__(**kwargs)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- prefix_len: int = DEFAULT_PREFIX_LEN,
- range_ratio: float = DEFAULT_RANGE_RATIO,
- input_len: int = DEFAULT_INPUT_LEN,
- output_len: int = DEFAULT_OUTPUT_LEN,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list[SampleRequest]:
- # Enforce range_ratio < 1
- assert range_ratio < 1.0, (
- "random_range_ratio must be < 1.0 to ensure a valid sampling range"
- )
-
- vocab_size = tokenizer.vocab_size
- num_special_tokens = tokenizer.num_special_tokens_to_add()
- real_input_len = input_len - num_special_tokens
-
- prefix_token_ids = (
- np.random.randint(0, vocab_size, size=prefix_len).tolist()
- if prefix_len > 0
- else []
- )
-
- # New sampling logic: [X * (1 - b), X * (1 + b)]
- input_low = int(real_input_len * (1 - range_ratio))
- input_high = int(real_input_len * (1 + range_ratio))
- output_low = int(output_len * (1 - range_ratio))
- # Ensure the lower bound for output length is at least 1 to prevent
- # sampling 0 tokens, which can cause request failures.
- output_low = max(output_low, 1)
- output_high = int(output_len * (1 + range_ratio))
-
- # Add logging for debugging
- logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
- logger.info("Sampling output_len from [%s, %s]", output_low, output_high)
-
- input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
- output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
- offsets = np.random.randint(0, vocab_size, size=num_requests)
-
- requests = []
- for i in range(num_requests):
- inner_seq = (
- (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
- ).tolist()
- token_sequence = prefix_token_ids + inner_seq
- prompt = tokenizer.decode(token_sequence)
- # After decoding the prompt we have to encode and decode it again.
- # This is done because in some cases N consecutive tokens
- # give a string tokenized into != N number of tokens.
- # For example for GPT2Tokenizer:
- # [6880, 6881] -> ['Ġcalls', 'here'] ->
- # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
- # To avoid uncontrolled change of the prompt length,
- # the encoded sequence is truncated before being decoded again.
- total_input_len = prefix_len + int(input_lens[i])
- re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
- :total_input_len
- ]
- prompt = tokenizer.decode(re_encoded_sequence)
- total_input_len = len(re_encoded_sequence)
- requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=total_input_len,
- expected_output_len=int(output_lens[i]),
- request_id=request_id_prefix + str(i),
- )
- )
-
- return requests
-
-
-# -----------------------------------------------------------------------------
-# ShareGPT Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ShareGPTDataset(BenchmarkDataset):
- """
- Implements the ShareGPT dataset. Loads data from a JSON file and generates
- sample requests based on conversation turns.
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
- with open(self.dataset_path, encoding="utf-8") as f:
- self.data = json.load(f)
- # Filter entries with at least two conversation turns.
- self.data = [
- entry
- for entry in self.data
- if "conversations" in entry and len(entry["conversations"]) >= 2
- ]
- random.seed(self.random_seed)
- random.shuffle(self.data)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- lora_path: Optional[str] = None,
- max_loras: Optional[int] = None,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- samples: list = []
- ind = 0
- for entry in self.data:
- if len(samples) >= num_requests:
- break
- prompt, completion = (
- entry["conversations"][0]["value"],
- entry["conversations"][1]["value"],
- )
-
- lora_request, tokenizer = self.get_random_lora_request(
- tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
- )
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- new_output_len = len(completion_ids) if output_len is None else output_len
- if not is_valid_sequence(
- prompt_len,
- new_output_len,
- skip_min_output_len_check=output_len is not None,
- ):
- continue
- if image_path := entry.get("image"):
- mm_content = process_image(image_path)
- elif video_path := entry.get("video"):
- mm_content = process_video(video_path)
- else:
- mm_content = None
- if enable_multimodal_chat:
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- samples.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=new_output_len,
- lora_request=lora_request,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
- return samples
-
-
-# -----------------------------------------------------------------------------
-# Custom Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class CustomDataset(BenchmarkDataset):
- """
- Implements the Custom dataset. Loads data from a JSONL file and generates
- sample requests based on conversation turns. E.g.,
- ```
- {"prompt": "What is the capital of India?"}
- {"prompt": "What is the capital of Iran?"}
- {"prompt": "What is the capital of China?"}
- ```
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
- # self.data will be a list of dictionaries
- # e.g., [{"prompt": "What is the capital of India?"}, ...]
- # This will be the standardized format which load_data()
- # has to convert into depending on the filetype of dataset_path.
- # sample() will assume this standardized format of self.data
- self.data = []
-
- # Load the JSONL file
- if self.dataset_path.endswith(".jsonl"):
- jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
-
- # check if the JSONL file has a 'prompt' column
- if "prompt" not in jsonl_data.columns:
- raise ValueError("JSONL file must contain a 'prompt' column.")
-
- # Convert each row to a dictionary and append to self.data
- # This will convert the DataFrame to a list of dictionaries
- # where each dictionary corresponds to a row in the DataFrame.
- # This is the standardized format we want for self.data
- for _, row in jsonl_data.iterrows():
- self.data.append(row.to_dict())
- else:
- raise NotImplementedError(
- "Only JSONL format is supported for CustomDataset."
- )
-
- random.seed(self.random_seed)
- random.shuffle(self.data)
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- lora_path: Optional[str] = None,
- max_loras: Optional[int] = None,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- skip_chat_template: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = item["prompt"]
-
- # apply template
- if not skip_chat_template:
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
-
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
-
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Sonnet Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class SonnetDataset(BenchmarkDataset):
- """
- Simplified implementation of the Sonnet dataset. Loads poem lines from a
- text file and generates sample requests. Default values here copied from
- `benchmark_serving.py` for the sonnet dataset.
- """
-
- DEFAULT_PREFIX_LEN = 200
- DEFAULT_INPUT_LEN = 550
- DEFAULT_OUTPUT_LEN = 150
-
- def __init__(
- self,
- **kwargs,
- ) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(self) -> None:
- if not self.dataset_path:
- raise ValueError("dataset_path must be provided.")
- with open(self.dataset_path, encoding="utf-8") as f:
- self.data = f.readlines()
-
- def sample(
- self,
- tokenizer,
- num_requests: int,
- prefix_len: int = DEFAULT_PREFIX_LEN,
- input_len: int = DEFAULT_INPUT_LEN,
- output_len: int = DEFAULT_OUTPUT_LEN,
- return_prompt_formatted: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- # Calculate average token length for a poem line.
- tokenized_lines = [tokenizer(line).input_ids for line in self.data]
- avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
-
- # Build the base prompt.
- base_prompt = "Pick as many lines as you can from these poem lines:\n"
- base_msg = [{"role": "user", "content": base_prompt}]
- base_fmt = tokenizer.apply_chat_template(
- base_msg, add_generation_prompt=True, tokenize=False
- )
- base_offset = len(tokenizer(base_fmt).input_ids)
- if input_len <= base_offset:
- raise ValueError(
- f"'input_len' must be higher than the base prompt length "
- f"({base_offset})."
- )
-
- # Determine how many poem lines to use.
- num_input_lines = round((input_len - base_offset) / avg_len)
- num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
- prefix_lines = self.data[:num_prefix_lines]
-
- samples = []
- ind = 0
- while len(samples) < num_requests:
- extra_lines = random.choices(
- self.data, k=num_input_lines - num_prefix_lines
- )
- prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
- msg = [{"role": "user", "content": prompt}]
- prompt_formatted = tokenizer.apply_chat_template(
- msg, add_generation_prompt=True, tokenize=False
- )
- prompt_len = len(tokenizer(prompt_formatted).input_ids)
-
- if prompt_len <= input_len:
- samples.append(
- SampleRequest(
- prompt=prompt_formatted if return_prompt_formatted else prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- return samples
-
-
-# -----------------------------------------------------------------------------
-# BurstGPT Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class BurstGPTDataset(BenchmarkDataset):
- """
- Implements the BurstGPT dataset. Loads data from a CSV file and generates
- sample requests based on synthetic prompt generation. Only rows with Model
- "GPT-4" and positive response tokens are used.
- """
-
- def __init__(self, **kwargs) -> None:
- super().__init__(**kwargs)
- self.load_data()
-
- def load_data(
- self,
- ):
- if self.dataset_path is None:
- raise ValueError("dataset_path must be provided for loading data.")
-
- df = pd.read_csv(self.dataset_path)
- # Filter to keep only GPT-4 rows.
- gpt4_df = df[df["Model"] == "GPT-4"]
- # Remove failed requests (where Response tokens is 0 or less).
- gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
- # Sample the desired number of rows.
- self.data = gpt4_df
-
- def _sample_loaded_data(self, num_requests: int) -> list:
- if num_requests <= len(self.data):
- data = self.data.sample(n=num_requests, random_state=self.random_seed)
- else:
- data = self.data.sample(
- n=num_requests,
- random_state=self.random_seed,
- replace=True,
- )
- # Convert the dataframe to a list of lists.
- return data.values.tolist()
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- max_loras: Optional[int] = None,
- lora_path: Optional[str] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list[SampleRequest]:
- samples = []
- data = self._sample_loaded_data(num_requests=num_requests)
- for i in range(num_requests):
- input_len = int(data[i][2])
- output_len = int(data[i][3])
- lora_req, tokenizer = self.get_random_lora_request(
- tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
- )
- vocab_size = tokenizer.vocab_size
- # Generate a synthetic prompt: a list of token IDs computed as (i +
- # j) modulo vocab_size.
- token_ids = [(i + j) % vocab_size for j in range(input_len)]
- prompt = tokenizer.decode(token_ids)
- samples.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=input_len,
- expected_output_len=output_len,
- lora_request=lora_req,
- request_id=request_id_prefix + str(i),
- )
- )
- return samples
-
-
-# -----------------------------------------------------------------------------
-# HuggingFace Dataset Base Implementation
-# -----------------------------------------------------------------------------
-class HuggingFaceDataset(BenchmarkDataset):
- """Base class for datasets hosted on HuggingFace."""
-
- SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
-
- def __init__(
- self,
- dataset_path: str,
- dataset_split: str,
- no_stream: bool = False,
- dataset_subset: Optional[str] = None,
- **kwargs,
- ) -> None:
- super().__init__(dataset_path=dataset_path, **kwargs)
-
- self.dataset_split = dataset_split
- self.dataset_subset = dataset_subset
- self.load_stream = not no_stream
- self.load_data()
-
- def load_data(self) -> None:
- """Load data from HuggingFace datasets."""
- self.data = load_dataset(
- self.dataset_path,
- name=self.dataset_subset,
- split=self.dataset_split,
- streaming=self.load_stream,
- )
- self.data = self.data.shuffle(seed=self.random_seed)
-
-
-# -----------------------------------------------------------------------------
-# Conversation Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ConversationDataset(HuggingFaceDataset):
- """Dataset for conversation data with multimodal support."""
-
- SUPPORTED_DATASET_PATHS = {
- "lmms-lab/LLaVA-OneVision-Data",
- "Aeala/ShareGPT_Vicuna_unfiltered",
- }
- IS_MULTIMODAL = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- # Filter examples with at least 2 conversations
- filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
- sampled_requests = []
- dynamic_output = output_len is None
- ind = 0
-
- for item in filtered_data:
- if len(sampled_requests) >= num_requests:
- break
- conv = item["conversations"]
- prompt, completion = conv[0]["value"], conv[1]["value"]
-
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- completion_len = len(completion_ids)
- output_len = completion_len if dynamic_output else output_len
- assert isinstance(output_len, int) and output_len > 0
- if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
- continue
- mm_content = process_image(item["image"]) if "image" in item else None
- if enable_multimodal_chat:
- # Note: when chat is enabled the request prompt_len is no longer
- # accurate and we will be using request output to count the
- # actual prompt len and output len
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Vision Arena Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class VisionArenaDataset(HuggingFaceDataset):
- """
- Vision Arena Dataset.
- """
-
- DEFAULT_OUTPUT_LEN = 128
- SUPPORTED_DATASET_PATHS = {
- "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
- "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
- }
- IS_MULTIMODAL = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
- if parser_fn is None:
- raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
- prompt = parser_fn(item)
- mm_content = process_image(item["images"][0])
- prompt_len = len(tokenizer(prompt).input_ids)
- if enable_multimodal_chat:
- # Note: when chat is enabled the request prompt_len is no longer
- # accurate and we will be using request output to count the
- # actual prompt len
- prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Instruct Coder Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class InstructCoderDataset(HuggingFaceDataset):
- """
- InstructCoder Dataset.
- https://huggingface.co/datasets/likaixin/InstructCoder
-
- InstructCoder is the dataset designed for general code editing. It consists
- of 114,239 instruction-input-output triplets, and covers multiple distinct
- code editing scenario.
- """
-
- DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
- SUPPORTED_DATASET_PATHS = {
- "likaixin/InstructCoder",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = (
- f"{item['input']}\n\n{item['instruction']} Just output "
- "the code, do not include any explanation."
- )
-
- # apply template
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# MT-Bench Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class MTBenchDataset(HuggingFaceDataset):
- """
- MT-Bench Dataset.
- https://huggingface.co/datasets/philschmid/mt-bench
-
- We create a single turn dataset for MT-Bench.
- This is similar to Spec decoding benchmark setup in vLLM
- https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
- """ # noqa: E501
-
- DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM
- SUPPORTED_DATASET_PATHS = {
- "philschmid/mt-bench",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- enable_multimodal_chat: bool = False,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- sampled_requests = []
-
- for i, item in enumerate(self.data):
- if len(sampled_requests) >= num_requests:
- break
- prompt = item["turns"][0]
-
- # apply template
- prompt = tokenizer.apply_chat_template(
- [{"role": "user", "content": prompt}],
- add_generation_prompt=True,
- tokenize=False,
- )
-
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- request_id=request_id_prefix + str(i),
- )
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# AIMO Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class AIMODataset(HuggingFaceDataset):
- """
- Dataset class for processing a AIMO dataset with reasoning questions.
- """
-
- SUPPORTED_DATASET_PATHS = {
- "AI-MO/aimo-validation-aime",
- "AI-MO/NuminaMath-1.5",
- "AI-MO/NuminaMath-CoT",
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- sampled_requests = []
- dynamic_output = output_len is None
- ind = 0
-
- for item in self.data:
- if len(sampled_requests) >= num_requests:
- break
- prompt, completion = item["problem"], item["solution"]
-
- prompt_ids = tokenizer(prompt).input_ids
- completion_ids = tokenizer(completion).input_ids
- prompt_len = len(prompt_ids)
- completion_len = len(completion_ids)
- output_len = completion_len if dynamic_output else output_len
- assert isinstance(output_len, int) and output_len > 0
- if dynamic_output and not is_valid_sequence(
- prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
- ):
- continue
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=None,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Next Edit Prediction Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-zeta_prompt = """### Instruction:
-You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
-
-### User Edits:
-
-{}
-
-### User Excerpt:
-
-{}
-
-### Response:
-
-""" # noqa: E501
-
-
-def _format_zeta_prompt(
- sample: dict, original_start_marker: str = "<|editable_region_start|>"
-) -> dict:
- """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
-
- This function formats examples from the NEP dataset
- into prompts and expected outputs. It could be
- further extended to support more NEP datasets.
-
- Args:
- sample: The dataset sample containing events,
- inputs, and outputs.
- original_start_marker: The marker indicating the
- start of the editable region. Defaults to
- "<|editable_region_start|>".
-
- Returns:
- A dictionary with the formatted prompts and expected outputs.
- """
- events = sample["events"]
- input = sample["input"]
- output = sample["output"]
- prompt = zeta_prompt.format(events, input)
-
- # following the original implementation, extract the focused region
- # from the raw output
- output_start_index = output.find(original_start_marker)
- output_focused_region = output[output_start_index:]
- expected_output = output_focused_region
-
- return {"prompt": prompt, "expected_output": expected_output}
-
-
-class NextEditPredictionDataset(HuggingFaceDataset):
- """
- Dataset class for processing a Next Edit Prediction dataset.
- """
-
- SUPPORTED_DATASET_PATHS = {
- "zed-industries/zeta",
- }
- MAPPING_PROMPT_FUNCS = {
- "zed-industries/zeta": _format_zeta_prompt,
- }
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- request_id_prefix: str = "",
- **kwargs,
- ):
- formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
- if formatting_prompt_func is None:
- raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
- samples = []
- for i, sample in enumerate(self.data):
- sample = formatting_prompt_func(sample)
- samples.append(
- SampleRequest(
- prompt=sample["prompt"],
- prompt_len=len(tokenizer(sample["prompt"]).input_ids),
- expected_output_len=len(
- tokenizer(sample["expected_output"]).input_ids
- ),
- request_id=request_id_prefix + str(i),
- )
- )
- if len(samples) >= num_requests:
- break
- self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
- return samples
-
-
-# -----------------------------------------------------------------------------
-# ASR Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ASRDataset(HuggingFaceDataset):
- """
- Dataset class for processing a ASR dataset for transcription.
- Tested on the following set:
-
- +----------------+----------------------------------------+--------------------------+-----------------------------+
- | Dataset | Domain | Speaking Style | hf-subset |
- +----------------+----------------------------------------+--------------------------+-----------------------------+
- | TED-LIUM | TED talks | Oratory | release1, release2, release3|
- | | | | release3-speaker-adaptation |
- | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... |
- | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" |
- | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test |
- | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test |
- | AMI | Meetings | Spontaneous | ihm, sdm |
- +----------------+----------------------------------------+--------------------------+-----------------------------+
-
- """ # noqa: E501
-
- SUPPORTED_DATASET_PATHS = {
- "openslr/librispeech_asr",
- "facebook/voxpopuli",
- "LIUM/tedlium",
- "edinburghcstr/ami",
- "speechcolab/gigaspeech",
- "kensho/spgispeech",
- }
-
- DEFAULT_OUTPUT_LEN = 128
- IS_MULTIMODAL = True
-
- # TODO Whisper-specific. Abstract interface when more models are supported.
- TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
- skip_long_audios: bool = True
-
- def sample(
- self,
- tokenizer: PreTrainedTokenizerBase,
- num_requests: int,
- output_len: Optional[int] = None,
- request_id_prefix: str = "",
- **kwargs,
- ) -> list:
- import librosa
-
- output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
- prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
- prompt_len = len(tokenizer(prompt).input_ids)
- sampled_requests = []
- skipped = 0
- ind = 0
- for item in self.data:
- if len(sampled_requests) >= num_requests:
- break
- audio = item["audio"]
- y, sr = audio["array"], audio["sampling_rate"]
- duration_s = librosa.get_duration(y=y, sr=sr)
- # Whisper max supported duration
- if self.skip_long_audios and duration_s > 30:
- skipped += 1
- continue
-
- mm_content = {"audio": (y, sr)}
- sampled_requests.append(
- SampleRequest(
- prompt=prompt,
- prompt_len=prompt_len,
- expected_output_len=output_len,
- multi_modal_data=mm_content,
- request_id=request_id_prefix + str(ind),
- )
- )
- ind += 1
- if skipped:
- logger.warning(
- "%d samples discarded from dataset due to"
- " their length being greater than"
- " what Whisper supports.",
- skipped,
- )
- self.maybe_oversample_requests(
- sampled_requests, num_requests, request_id_prefix
- )
- return sampled_requests
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index 11833fa1b3c8..626b150ee4ce 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -1,17 +1,31 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
+import time
+from unittest import mock
import numpy as np
+from benchmark_utils import TimeCollector
from tabulate import tabulate
-from benchmark_utils import TimeCollector
-from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.config import (
+ CacheConfig,
+ DeviceConfig,
+ LoadConfig,
+ ModelConfig,
+ ParallelConfig,
+ SchedulerConfig,
+ SpeculativeConfig,
+ VllmConfig,
+)
+from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.worker.gpu_input_batch import InputBatch
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner
-def main(args):
+def benchmark_propose(args):
rows = []
for max_ngram in args.max_ngram:
collector = TimeCollector(TimeCollector.US)
@@ -69,10 +83,88 @@ def main(args):
)
+def benchmark_batched_propose(args):
+ NUM_SPECULATIVE_TOKENS_NGRAM = 10
+ PROMPT_LOOKUP_MIN = 5
+ PROMPT_LOOKUP_MAX = 15
+ MAX_MODEL_LEN = int(1e7)
+ DEVICE = current_platform.device_type
+
+ model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
+
+ speculative_config = SpeculativeConfig(
+ target_model_config=model_config,
+ target_parallel_config=ParallelConfig(),
+ method="ngram",
+ num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
+ prompt_lookup_max=PROMPT_LOOKUP_MAX,
+ prompt_lookup_min=PROMPT_LOOKUP_MIN,
+ )
+
+ vllm_config = VllmConfig(
+ model_config=model_config,
+ cache_config=CacheConfig(),
+ speculative_config=speculative_config,
+ device_config=DeviceConfig(device=current_platform.device_type),
+ parallel_config=ParallelConfig(),
+ load_config=LoadConfig(),
+ scheduler_config=SchedulerConfig(),
+ )
+
+ # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
+ mock_pp_group = mock.MagicMock()
+ mock_pp_group.world_size = 1
+ with mock.patch(
+ "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
+ ):
+ runner = GPUModelRunner(vllm_config, DEVICE)
+
+ # hack max model len
+ runner.max_model_len = MAX_MODEL_LEN
+ runner.drafter.max_model_len = MAX_MODEL_LEN
+
+ dummy_input_batch = InputBatch(
+ max_num_reqs=args.num_req,
+ max_model_len=MAX_MODEL_LEN,
+ max_num_batched_tokens=args.num_req * args.num_token,
+ device=DEVICE,
+ pin_memory=False,
+ vocab_size=256000,
+ block_sizes=[16],
+ )
+ dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
+ dummy_input_batch.spec_decode_unsupported_reqs = ()
+ dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
+ dummy_input_batch.token_ids_cpu = np.random.randint(
+ 0, 20, (args.num_req, args.num_token)
+ )
+
+ runner.input_batch = dummy_input_batch
+
+ sampled_token_ids = [[0]] * args.num_req
+
+ print("Starting benchmark")
+    # The first iteration serves as a warmup; ignore its reported time.
+ for _ in range(args.num_iteration):
+ start = time.time()
+ runner.drafter.propose(
+ sampled_token_ids,
+ dummy_input_batch.req_ids,
+ dummy_input_batch.num_tokens_no_spec,
+ dummy_input_batch.token_ids_cpu,
+ dummy_input_batch.spec_decode_unsupported_reqs,
+ )
+ end = time.time()
+ print(f"Iteration time (s): {end - start}")
+
+
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of N-gram speculative decode drafting"
)
+ parser.add_argument(
+ "--batched", action="store_true", help="consider time to prepare batch"
+ )
parser.add_argument(
"--num-iteration",
type=int,
@@ -105,8 +197,17 @@ def invoke_main() -> None:
help="Number of speculative tokens to generate",
)
args = parser.parse_args()
- main(args)
+
+ if not args.batched:
+ benchmark_propose(args)
+ else:
+ benchmark_batched_propose(args)
+"""
+# Example command lines:
+# time python3 benchmarks/benchmark_ngram_proposer.py
+# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
+""" # noqa: E501
if __name__ == "__main__":
invoke_main() # pragma: no cover
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index b5e2613de1cd..d7dc0e991c4d 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -32,7 +32,6 @@
import json
import random
import time
-from typing import Optional
from transformers import PreTrainedTokenizerBase
@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
- fixed_output_len: Optional[int],
+ fixed_output_len: int | None,
) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
- fixed_output_len: Optional[int],
+ fixed_output_len: int | None,
prefix_len: int,
) -> list[Request]:
requests = []
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index bb453791c186..769f52dbab6e 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -7,7 +7,6 @@
import json
import random
import time
-from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@@ -24,7 +23,7 @@ def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
- fixed_output_len: Optional[int],
+ fixed_output_len: int | None,
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 4aae755eb4e4..539ab2ed0a4d 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -31,20 +31,19 @@
import uuid
import warnings
from collections.abc import AsyncGenerator
+from contextlib import nullcontext
from dataclasses import dataclass
-from typing import Optional
import datasets
import numpy as np
import pandas as pd
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
RequestFuncInput,
RequestFuncOutput,
)
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -317,7 +316,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
- goodput_config_dict: Optional[dict[str, float]] = None,
+ goodput_config_dict: dict[str, float] | None = None,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
@@ -437,9 +436,9 @@ async def benchmark(
selected_percentile_metrics: list[str],
selected_percentiles: list[str],
ignore_eos: bool,
- max_concurrency: Optional[int],
+ max_concurrency: int | None,
structured_output_ratio: float,
- goodput_config_dict: Optional[dict[str, float]] = None,
+ goodput_config_dict: dict[str, float] | None = None,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -449,7 +448,8 @@ async def benchmark(
def prepare_extra_body(request) -> dict:
extra_body = {}
# Add the schema to the extra_body
- extra_body[request.structure_type] = request.schema
+ extra_body["structured_outputs"] = {}
+ extra_body["structured_outputs"][request.structure_type] = request.schema
return extra_body
print("Starting initial single prompt test run...")
@@ -502,15 +502,9 @@ def prepare_extra_body(request) -> dict:
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
- # This can be used once the minimum Python version is 3.10 or higher,
- # and it will simplify the code in limited_request_func.
- # semaphore = (asyncio.Semaphore(max_concurrency)
- # if max_concurrency else contextlib.nullcontext())
- semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+ semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
async def limited_request_func(request_func_input, pbar):
- if semaphore is None:
- return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar)
@@ -696,11 +690,11 @@ def _eval_correctness_regex(expected, actual):
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
- if args.structure_type == "guided_json":
+ if args.structure_type == "json":
return _eval_correctness_json(expected, actual)
- elif args.structure_type == "guided_regex":
+ elif args.structure_type == "regex":
return _eval_correctness_regex(expected, actual)
- elif args.structure_type == "guided_choice":
+ elif args.structure_type == "choice":
return _eval_correctness_choice(expected, actual)
else:
return None
@@ -780,18 +774,18 @@ def main(args: argparse.Namespace):
)
if args.dataset == "grammar":
- args.structure_type = "guided_grammar"
+ args.structure_type = "grammar"
elif args.dataset == "regex":
- args.structure_type = "guided_regex"
+ args.structure_type = "regex"
elif args.dataset == "choice":
- args.structure_type = "guided_choice"
+ args.structure_type = "choice"
else:
- args.structure_type = "guided_json"
+ args.structure_type = "json"
if args.no_structured_output:
args.structured_output_ratio = 0
if args.save_results:
- result_file_name = f"{args.structured_output_ratio}guided"
+ result_file_name = f"{args.structured_output_ratio}so"
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"
@@ -909,13 +903,13 @@ def create_argument_parser():
parser.add_argument(
"--tokenizer",
type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
+ help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default="auto",
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
+ help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument(
"--num-prompts",
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 98624abdf49f..f0d661f9d534 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -6,7 +6,7 @@
import os
import time
from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any
def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
def __init__(self, scale: int) -> None:
self.cnt: int = 0
self._sum: int = 0
- self._max: Optional[int] = None
+ self._max: int | None = None
self.scale = scale
self.start_time: int = time.monotonic_ns()
@@ -104,13 +104,13 @@ def collect(self, v: int) -> None:
else:
self._max = max(self._max, v)
- def avg(self) -> Union[float, str]:
+ def avg(self) -> float | str:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
- def max(self) -> Union[float, str]:
+ def max(self) -> float | str:
return self._max / self.scale if self._max else "N/A"
- def dump_avg_max(self) -> list[Union[float, str]]:
+ def dump_avg_max(self) -> list[float | str]:
return [self.avg(), self.max()]
def __enter__(self) -> None:
@@ -118,8 +118,8 @@ def __enter__(self) -> None:
def __exit__(
self,
- exc_type: Optional[type[BaseException]],
- exc_value: Optional[BaseException],
- exc_traceback: Optional[TracebackType],
+ exc_type: type[BaseException] | None,
+ exc_value: BaseException | None,
+ exc_traceback: TracebackType | None,
) -> None:
self.collect(time.monotonic_ns() - self.start_time)
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index 9ec270bbd2e9..22fc2678fd1c 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -6,8 +6,7 @@
import itertools
import pickle as pkl
import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable
import torch
import torch.utils.benchmark as TBenchmark
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index a5a5b52f6039..2deebf3ddb7a 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -6,8 +6,7 @@
import itertools
import pickle as pkl
import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable
import torch
import torch.utils.benchmark as TBenchmark
@@ -17,7 +16,7 @@
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- w8a8_block_fp8_matmul,
+ w8a8_triton_block_scaled_mm,
)
from vllm.utils import FlexibleArgumentParser, cdiv
@@ -53,7 +52,7 @@ def bench_int8(
n: int,
label: str,
sub_label: str,
- bench_kernels: Optional[list[str]] = None,
+ bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels."""
assert dtype == torch.int8
@@ -108,7 +107,7 @@ def bench_fp8(
n: int,
label: str,
sub_label: str,
- bench_kernels: Optional[list[str]] = None,
+ bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn
@@ -158,7 +157,7 @@ def bench_fp8(
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
),
- "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
+ "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@@ -183,7 +182,7 @@ def bench(
n: int,
label: str,
sub_label: str,
- bench_kernels: Optional[list[str]] = None,
+ bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(
dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]],
- bench_kernels: Optional[list[str]] = None,
+ bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 2c72941cf7e5..d683835db96a 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -55,9 +55,7 @@ benchmark() {
output_len=$2
- CUDA_VISIBLE_DEVICES=0 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@@ -65,9 +63,7 @@ benchmark() {
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
- CUDA_VISIBLE_DEVICES=1 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index 0bbf7cd2b1c8..35c86cc84522 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -38,16 +38,12 @@ wait_for_server() {
launch_chunked_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
- CUDA_VISIBLE_DEVICES=0 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--enable-chunked-prefill \
--gpu-memory-utilization 0.6 &
- CUDA_VISIBLE_DEVICES=1 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--enable-chunked-prefill \
@@ -62,18 +58,14 @@ launch_chunked_prefill() {
launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
- CUDA_VISIBLE_DEVICES=0 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
- CUDA_VISIBLE_DEVICES=1 python3 \
- -m vllm.entrypoints.openai.api_server \
- --model $model \
+ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index 901524214469..d809bf1db8cb 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -3,10 +3,9 @@
import pickle as pkl
import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
-from typing import Callable, Optional
import torch
import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
- residual: Optional[torch.Tensor],
+ residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
def unfused_fp8_impl(
rms_norm_layer: RMSNorm,
x: torch.Tensor,
- residual: Optional[torch.Tensor],
+ residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
# Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor,
- residual: Optional[torch.Tensor],
+ residual: torch.Tensor | None,
quant_dtype: torch.dtype,
):
out, _ = ops.rms_norm_dynamic_per_token_quant(
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
index 9663503e9baa..f1e504499eaf 100644
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -4,7 +4,10 @@
import torch
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- w8a8_block_fp8_matmul,
+ apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+ CUTLASS_BLOCK_FP8_SUPPORTED,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton as vllm_triton
@@ -29,7 +32,7 @@
]
-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
"""Build runner function for w8a8 block fp8 matmul."""
factor_for_scale = 1e-2
@@ -37,37 +40,54 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
fp8_max, fp8_min = fp8_info.max, fp8_info.min
# Create random FP8 tensors
- A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
- A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+ A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
- B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
- B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+ B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+ B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
# Create scales
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
- As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
Bs = (
torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
* factor_for_scale
)
+ # SM90 CUTLASS requires row-major format for scales
+ if use_cutlass and current_platform.is_device_capability(90):
+ Bs = Bs.T.contiguous()
+
def run():
- return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+ if use_cutlass:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+ )
+ else:
+ return apply_w8a8_block_fp8_linear(
+ A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+ )
return run
+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+ available_providers.append("w8a8-block-fp8-cutlass")
+
+
@vllm_triton.testing.perf_report(
vllm_triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
- line_vals=["torch-bf16", "w8a8-block-fp8"],
- line_names=["torch-bf16", "w8a8-block-fp8"],
+ line_vals=available_providers,
+ line_names=available_providers,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs W8A8 Block FP8 GEMMs",
args={},
@@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
- else: # w8a8-block-fp8
- run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
+ elif provider == "w8a8-block-fp8-triton":
+ run_w8a8_triton = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=False
+ )
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8_triton(), quantiles=quantiles
+ )
+ elif provider == "w8a8-block-fp8-cutlass":
+ run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+ M, N, K, block_size, device, use_cutlass=True
+ )
ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
- lambda: run_w8a8(), quantiles=quantiles
+ lambda: run_w8a8_cutlass(), quantiles=quantiles
)
+ else:
+ raise ValueError(f"Unknown provider: {provider}")
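+    # Each GEMM performs 2 * M * N * K FLOPs; convert ms -> s and FLOPs -> TFLOPs.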
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
diff --git a/benchmarks/kernels/bench_mxfp4_qutlass.py b/benchmarks/kernels/bench_mxfp4_qutlass.py
new file mode 100644
index 000000000000..dfc7721876a1
--- /dev/null
+++ b/benchmarks/kernels/bench_mxfp4_qutlass.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+ "torch-bf16": dict(enabled=True),
+ "mxfp4": dict(no_a_quant=False, enabled=True),
+ "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+ return (
+ deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+ * group_size**-0.5
+ )
+
+
+def _quant_weight_mxfp4(
+ b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
+):
+ weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
+ b, forward_hadamard_matrix, method="abs_max"
+ )
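+    # to_blocked rearranges the E8M0 scales into the blocked layout the MXFP4
+    # GEMM kernel expects (inferred from the qutlass_utils helper's name).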
+ weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
+ return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
+ weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
+ b, forward_hadamard_matrix, device
+ )
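+    # (assumption) alpha is a global output scale for the fused GEMM; 1.0 is
+    # used here since only throughput is measured.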
+ alpha = torch.tensor([1.0], device="cuda")
+
+ if cfg["no_a_quant"]:
+ # Pre-quantize activation
+ input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+ a, forward_hadamard_matrix, method="abs_max"
+ )
+ input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+
+ def run():
+ return matmul_mxf4_bf16_tn(
+ input_hf_e2m1,
+ weight_hf_e2m1,
+ input_hf_scale_block,
+ weight_hf_scale_block,
+ alpha,
+ )
+
+ return run
+
+ # Quantize activation on-the-fly
+ def run():
+ input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+ a, forward_hadamard_matrix, method="abs_max"
+ )
+ input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+ return matmul_mxf4_bf16_tn(
+ input_hf_e2m1,
+ weight_hf_e2m1,
+ input_hf_scale_block,
+ weight_hf_scale_block,
+ alpha,
+ )
+
+ return run
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[
+ 1,
+ 4,
+ 8,
+ 16,
+ 32,
+ 64,
+ 128,
+ 256,
+ 512,
+ 1024,
+ 2048,
+ 4096,
+ 8192,
+ 16384,
+ 24576,
+ 32768,
+ ],
+ x_log=False,
+ line_arg="provider",
+ line_vals=_enabled,
+ line_names=_enabled,
+ ylabel="TFLOP/s (larger is better)",
+ plot_name="BF16 vs MXFP4 GEMMs",
+ args={},
+ )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+ M = batch_size
+ device = "cuda"
+ dtype = torch.bfloat16
+
+ a = torch.randn((M, K), device=device, dtype=dtype)
+ b = torch.randn((N, K), device=device, dtype=dtype)
+ forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "torch-bf16":
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+ )
+ else:
+ cfg = PROVIDER_CFGS[provider]
+ run_quant = build_mxfp4_runner(
+ cfg, a, b, forward_hadamard_matrix, dtype, device
+ )
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: run_quant(), rep=200, quantiles=quantiles
+ )
+
+ to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+ return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+ out = []
+ for model, tp_size in itertools.product(args.models, args.tp_sizes):
+ for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+ KN[tp_dim] //= tp_size
+ KN.append(model)
+ out.append(KN)
+ return out
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=["meta-llama/Llama-3.3-70B-Instruct"],
+ choices=list(WEIGHT_SHAPES.keys()),
+ )
+ parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+ args = parser.parse_args()
+
+ for K, N, model in prepare_shapes(args):
+ for had_size in [32, 64, 128]:
+ print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
+ benchmark.run(
+ print_data=True,
+ show_plots=True,
+ save_path=f"bench_mxfp4_res_n{N}_k{K}",
+ N=N,
+ K=K,
+ had_size=had_size,
+ )
+
+ print("Benchmark finished!")
diff --git a/benchmarks/kernels/bench_nvfp4_gemm.py b/benchmarks/kernels/bench_nvfp4_gemm.py
index 9e832c9faa8e..6b19eb113f3e 100644
--- a/benchmarks/kernels/bench_nvfp4_gemm.py
+++ b/benchmarks/kernels/bench_nvfp4_gemm.py
@@ -3,6 +3,7 @@
import argparse
import copy
import itertools
+import os
import torch
from weight_shapes import WEIGHT_SHAPES
@@ -23,21 +24,45 @@
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+ "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
+ "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
}
+_needs_fbgemm = any(
+ v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
+)
+if _needs_fbgemm:
+ try:
+ from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
+ triton_scale_nvfp4_quant,
+ )
+ except ImportError:
+ print(
+ "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
+ "These providers will be skipped. Please install fbgemm_gpu with: "
+ "'pip install fbgemm-gpu-genai' to run them."
+ )
+ # Disable FBGEMM providers so the benchmark can run.
+ for cfg in PROVIDER_CFGS.values():
+ if cfg.get("fbgemm"):
+ cfg["enabled"] = False
+
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
-def _quant_weight_nvfp4(b: torch.Tensor, device: str):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
# Compute global scale for weight
b_amax = torch.abs(b).max().to(torch.float32)
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
- b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+ if "fbgemm" in cfg and cfg["fbgemm"]:
+ b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
+ else:
+ b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
return b_fp4, scale_b_fp4, b_global_scale
def build_nvfp4_runner(cfg, a, b, dtype, device):
- b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
+ b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
# Compute global scale for activation
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
@@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
# Alpha for the GEMM operation
alpha = 1.0 / (a_global_scale * b_global_scale)
+ if "fbgemm" in cfg and cfg["fbgemm"]:
+ if cfg["no_a_quant"]:
+ a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+
+ def run():
+ return torch.ops.fbgemm.f4f4bf16(
+ a_fp4,
+ b_fp4,
+ scale_a_fp4,
+ scale_b_fp4,
+ global_scale=alpha,
+ use_mx=False,
+ )
+
+ return run
+ else:
+
+ def run():
+ a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+ return torch.ops.fbgemm.f4f4bf16(
+ a_fp4,
+ b_fp4,
+ scale_a_fp4,
+ scale_b_fp4,
+ global_scale=alpha,
+ use_mx=False,
+ )
+
+ return run
if cfg["no_a_quant"]:
# Pre-quantize activation
@@ -130,10 +184,13 @@ def prepare_shapes(args):
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+ save_dir = f"bench_nvfp4_res_n{N}_k{K}"
+ os.makedirs(save_dir, exist_ok=True)
+
benchmark.run(
print_data=True,
show_plots=True,
- save_path=f"bench_nvfp4_res_n{N}_k{K}",
+ save_path=save_dir,
N=N,
K=K,
)
diff --git a/benchmarks/kernels/bench_nvfp4_qutlass.py b/benchmarks/kernels/bench_nvfp4_qutlass.py
new file mode 100644
index 000000000000..6fecc816f946
--- /dev/null
+++ b/benchmarks/kernels/bench_nvfp4_qutlass.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
+from vllm._custom_ops import fusedQuantizeNv
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+ "torch-bf16": dict(enabled=True),
+ "nvfp4": dict(no_a_quant=False, enabled=True),
+ "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+ return (
+ deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+ * group_size**-0.5
+ )
+
+
+def _quant_weight_nvfp4(
+ b: torch.Tensor,
+ forward_hadamard_matrix: torch.Tensor,
+ global_scale: torch.Tensor,
+ device: str,
+ M: int,
+ N: int,
+ K: int,
+):
+ weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
+ b, forward_hadamard_matrix, global_scale
+ )
+ weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
+ -1, K // 16
+ )
+ return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
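+    # Unit alpha/global scales keep the benchmark simple; in practice the
+    # global scale is derived from weight/activation amax, as in
+    # bench_nvfp4_gemm.py.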
+ alpha = torch.tensor([1.0], device="cuda")
+ global_scale = torch.tensor([1.0], device="cuda")
+ weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
+ b, forward_hadamard_matrix, global_scale, device, M, N, K
+ )
+
+ if cfg["no_a_quant"]:
+ # Pre-quantize activation
+ input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+ a, forward_hadamard_matrix, global_scale
+ )
+ input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+ -1, K // 16
+ )
+
+ def run():
+ return ops.cutlass_scaled_fp4_mm(
+ input_hf_e2m1,
+ weight_hf_e2m1,
+ input_hf_scale_block,
+ weight_hf_scale_block,
+ alpha,
+ torch.bfloat16,
+ )
+
+ return run
+
+ # Quantize activation on-the-fly
+ def run():
+ input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+ a, forward_hadamard_matrix, global_scale
+ )
+ input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+ -1, K // 16
+ )
+ return ops.cutlass_scaled_fp4_mm(
+ input_hf_e2m1,
+ weight_hf_e2m1,
+ input_hf_scale_block,
+ weight_hf_scale_block,
+ alpha,
+ torch.bfloat16,
+ )
+
+ return run
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[
+ 1,
+ 4,
+ 8,
+ 16,
+ 32,
+ 64,
+ 128,
+ 256,
+ 512,
+ 1024,
+ 2048,
+ 4096,
+ 8192,
+ 16384,
+ 24576,
+ 32768,
+ ],
+ x_log=False,
+ line_arg="provider",
+ line_vals=_enabled,
+ line_names=_enabled,
+ ylabel="TFLOP/s (larger is better)",
+ plot_name="BF16 vs NVFP4 GEMMs",
+ args={},
+ )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+ M = batch_size
+ device = "cuda"
+ dtype = torch.bfloat16
+
+ a = torch.randn((M, K), device=device, dtype=dtype)
+ b = torch.randn((N, K), device=device, dtype=dtype)
+ forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "torch-bf16":
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+ )
+ else:
+ cfg = PROVIDER_CFGS[provider]
+ run_quant = build_nvfp4_runner(
+ cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
+ )
+ ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+ lambda: run_quant(), rep=200, quantiles=quantiles
+ )
+
+ to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+ return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+ out = []
+ for model, tp_size in itertools.product(args.models, args.tp_sizes):
+ for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+ KN[tp_dim] //= tp_size
+ KN.append(model)
+ out.append(KN)
+ return out
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=["meta-llama/Llama-3.3-70B-Instruct"],
+ choices=list(WEIGHT_SHAPES.keys()),
+ )
+ parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+ args = parser.parse_args()
+
+ for K, N, model in prepare_shapes(args):
+ for had_size in [16, 32, 64, 128]:
+ print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+ benchmark.run(
+ print_data=True,
+ show_plots=True,
+ save_path=f"bench_nvfp4_res_n{N}_k{K}",
+ N=N,
+ K=K,
+ had_size=had_size,
+ )
+
+ print("Benchmark finished!")
diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py
index 923d678f1f2d..d33b84fc3601 100644
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -1,15 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
-from typing import Callable
+from collections.abc import Callable
+from unittest.mock import patch
+import pandas as pd
import torch
-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
+
+def with_triton_mode(fn):
+ """Temporarily force the Triton fallback path"""
+
+ def wrapped(*args, **kwargs):
+ with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+ return fn(*args, **kwargs)
+
+ return wrapped
# TODO(luka): use standalone_compile utility
@@ -21,78 +33,238 @@ def inner(*args):
return inner
-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
- torch_per_token_quant_fp8 = torch.compile(
- QuantFP8(False, GroupShape.PER_TOKEN),
- fullgraph=True,
- dynamic=False, # recompile for different shapes
- )
+def bench_compile(fn: Callable):
+ # recompile for different shapes
+ fwd = torch.compile(fn, fullgraph=True, dynamic=False)
# First dim is explicitly dynamic to simulate vLLM usage
- torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
+ return with_dyn_arg(fwd, 0, 0)
-def cuda_per_token_quant_fp8(
- input: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
- return ops.scaled_fp8_quant(input)
+torch._dynamo.config.recompile_limit = 8888
-def calculate_diff(batch_size: int, seq_len: int):
- """Calculate difference between Triton and CUDA implementations."""
+def calculate_diff(
+ batch_size: int,
+ hidden_size: int,
+ group_shape: GroupShape,
+ dtype: torch.dtype,
+):
+ """Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda")
- x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
+ x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
+
+ quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
- torch_out, torch_scale = torch_per_token_quant_fp8(x)
- cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
+ torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
+ torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
+ cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
- if torch.allclose(
- cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
- ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
+ try:
+ torch.testing.assert_close(
+ cuda_out.to(torch.float32),
+ torch_out.to(torch.float32),
+ rtol=1e-3,
+ atol=1e-5,
+ )
+ torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
+ torch.testing.assert_close(
+ cuda_out.to(torch.float32),
+ torch_eager_out.to(torch.float32),
+ rtol=1e-3,
+ atol=1e-5,
+ )
+ torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
print("✅ All implementations match")
- else:
+ except AssertionError as e:
print("❌ Implementations differ")
+ print(e)
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+configs = []
-configs = list(itertools.product(batch_size_range, seq_len_range))
-
-@triton.testing.perf_report(
- triton.testing.Benchmark(
- x_names=["batch_size", "seq_len"],
- x_vals=configs,
- line_arg="provider",
- line_vals=["torch", "cuda"],
- line_names=["Torch", "CUDA"],
- styles=[("blue", "-"), ("green", "-")],
- ylabel="us",
- plot_name="per-token-dynamic-quant-fp8-performance",
- args={},
- )
-)
-def benchmark_quantization(batch_size, seq_len, provider):
- dtype = torch.float16
+def benchmark_quantization(
+ batch_size,
+ hidden_size,
+ provider,
+ group_shape: GroupShape,
+ col_major: bool,
+ dtype: torch.dtype,
+):
device = torch.device("cuda")
- x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+ x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
+ quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
if provider == "torch":
- fn = lambda: torch_per_token_quant_fp8(x.clone())
+ fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
elif provider == "cuda":
- fn = lambda: cuda_per_token_quant_fp8(x.clone())
+ fn = lambda: quant_fp8.forward_cuda(x.clone())
+ elif provider == "triton":
+ if not group_shape.is_per_group():
+ # Triton only supported for per-group
+ return 0, 0, 0
+
+ fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+# TODO(luka) extract to utils
+def compute_geomean_speedups(
+ df: pd.DataFrame,
+ baseline_col: str,
+ speedup_cols: list[str],
+ groupby_cols: list[str] | None = None,
+) -> pd.DataFrame:
+ """
+ Compute geometric mean speedups over a baseline column.
+
+ Args:
+ df: Input dataframe
+ baseline_col: Column to use as baseline
+ speedup_cols: Columns to compute speedups for
+ groupby_cols: Columns to group by. If None, compute over entire df.
+
+ Returns:
+ pd.DataFrame with geometric mean speedups
+ """
+ from scipy.stats import gmean
+
+ def geo_speedup(group: pd.DataFrame) -> pd.Series:
+ ratios = {
+ col: (group[baseline_col] / group[col]).values for col in speedup_cols
+ }
+ return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
+
+ if groupby_cols is None:
+ result = geo_speedup(df).to_frame().T
+ else:
+ result = (
+ df.groupby(groupby_cols)
+ .apply(geo_speedup, include_groups=False)
+ .reset_index()
+ )
+
+ return result
+
+
if __name__ == "__main__":
- calculate_diff(batch_size=4, seq_len=4096)
- benchmark_quantization.run(print_data=True)
+ parser = FlexibleArgumentParser(
+ description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
+ )
+ parser.add_argument("-c", "--check", action="store_true")
+ parser.add_argument(
+ "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+ )
+ parser.add_argument(
+ "--hidden-sizes",
+ type=int,
+ nargs="+",
+ default=[896, 1024, 2048, 4096, 7168],
+ help="Hidden sizes to benchmark",
+ )
+ parser.add_argument(
+ "--batch-sizes",
+ type=int,
+ nargs="+",
+ default=[1, 16, 128, 512, 1024],
+ help="Batch sizes to benchmark",
+ )
+ parser.add_argument(
+ "--group-sizes",
+ type=int,
+ nargs="+",
+ default=None,
+ help="Group sizes for GroupShape(1,N) to benchmark. "
+ "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
+ )
+ parser.add_argument(
+ "--no-column-major",
+ action="store_true",
+ help="Disable column-major scales testing",
+ )
+
+ args = parser.parse_args()
+ assert args
+
+ dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+ hidden_sizes = args.hidden_sizes
+ batch_sizes = args.batch_sizes
+
+ if args.group_sizes is not None:
+ group_shapes = []
+ for size in args.group_sizes:
+ if size == 0:
+ group_shapes.append(GroupShape.PER_TENSOR)
+ elif size == -1:
+ group_shapes.append(GroupShape.PER_TOKEN)
+ else:
+ group_shapes.append(GroupShape(1, size))
+ else:
+ group_shapes = [
+ GroupShape.PER_TENSOR,
+ GroupShape.PER_TOKEN,
+ GroupShape(1, 64),
+ GroupShape(1, 128),
+ ]
+
+ column_major_scales = [False] if args.no_column_major else [True, False]
+
+ config_gen = itertools.product(
+ group_shapes,
+ column_major_scales,
+ batch_sizes,
+ hidden_sizes,
+ )
+
+    # Keep column-major scales only for per-group shapes, and reverse each
+    # tuple to match the x_names order (hidden_size, batch_size, col_major,
+    # group_shape).
+ configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
+
+ print(f"Running {len(configs)} configurations:")
+ print(f" Hidden sizes: {hidden_sizes}")
+ print(f" Batch sizes: {batch_sizes}")
+ print(f" Group shapes: {[str(g) for g in group_shapes]}")
+ print(f" Column major scales: {column_major_scales}")
+ print()
+
+ if args.check:
+ for group_shape in group_shapes:
+ group_size = group_shape[1]
+ print(f"{group_size=}")
+ calculate_diff(
+ batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
+ )
+
+ benchmark = triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
+ x_vals=configs,
+ line_arg="provider",
+ line_vals=["torch", "cuda", "triton"],
+ line_names=["Torch (Compiled)", "CUDA", "Triton"],
+ styles=[("blue", "-"), ("green", "-"), ("black", "-")],
+ ylabel="us",
+ plot_name="QuantFP8 performance",
+ args={},
+ )
+ )(benchmark_quantization)
+
+ df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
+
+ # Print geomean speedups
+ geo_table_grouped = compute_geomean_speedups(
+ df,
+ baseline_col="Torch (Compiled)",
+ speedup_cols=["CUDA", "Triton"],
+ groupby_cols=["col_major", "group_shape"],
+ )
+
+ print("Speedup over Torch (Compiled)")
+ print(geo_table_grouped.to_string(index=False))
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
index 93edbcc9391f..7662655b5efa 100644
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -10,7 +10,8 @@
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
index 35c20ee41b9a..726a2a371d10 100644
--- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py
@@ -13,6 +13,10 @@
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import (
+ fp8_w8a8_moe_quant_config,
+ nvfp4_moe_quant_config,
+)
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
@@ -140,6 +144,12 @@ def run_triton_moe(
a_fp8_scale: torch.Tensor,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_fp8_scale,
+ )
+
for _ in range(num_repeats):
fused_experts(
a,
@@ -147,10 +157,7 @@ def run_triton_moe(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_fp8_scale,
+ quant_config=quant_config,
)
def run_cutlass_moe_fp4(
@@ -172,25 +179,27 @@ def run_cutlass_moe_fp4(
device: torch.device,
num_repeats: int,
):
+ quant_config = nvfp4_moe_quant_config(
+ a1_gscale=a1_gs,
+ a2_gscale=a2_gs,
+ w1_scale=w1_blockscale,
+ w2_scale=w2_blockscale,
+ g1_alphas=w1_gs,
+ g2_alphas=w2_gs,
+ )
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp4", color="green"):
cutlass_moe_fp4(
a=a,
- a1_gscale=a1_gs,
- a2_gscale=a2_gs,
w1_fp4=w1_fp4,
- w1_blockscale=w1_blockscale,
- w1_alphas=w1_gs,
w2_fp4=w2_fp4,
- w2_blockscale=w2_blockscale,
- w2_alphas=w2_gs,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
- device=device,
+ quant_config=quant_config,
)
def run_cutlass_from_graph(
@@ -211,26 +220,29 @@ def run_cutlass_from_graph(
e: int,
device: torch.device,
):
+ quant_config = nvfp4_moe_quant_config(
+ a1_gscale=a1_gs,
+ a2_gscale=a2_gs,
+ w1_scale=w1_blockscale,
+ w2_scale=w2_blockscale,
+ g1_alphas=w1_gs,
+ g2_alphas=w2_gs,
+ )
+
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
return cutlass_moe_fp4(
a=a,
- a1_gscale=a1_gs,
w1_fp4=w1_fp4,
- w1_blockscale=w1_blockscale,
- w1_alphas=w1_alphas,
- a2_gscale=a2_gs,
w2_fp4=w2_fp4,
- w2_blockscale=w2_blockscale,
- w2_alphas=w2_alphas,
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
n=n,
k=k,
e=num_experts,
- device=device,
+ quant_config=quant_config,
)
def run_triton_from_graph(
@@ -246,16 +258,18 @@ def run_triton_from_graph(
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_fp8_scale,
+ )
return fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_fp8_scale,
+ quant_config=quant_config,
)
def replay_graph(graph, num_repeats):
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
new file mode 100644
index 000000000000..b419b2fa0e3e
--- /dev/null
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -0,0 +1,406 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
+kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
+but use different quantization strategies and backends.
+"""
+
+import nvtx
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
+from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+
+# Weight shapes for different models: [num_experts, topk, hidden_size,
+# intermediate_size]
+WEIGHT_SHAPES_MOE = {
+ "mixtral-8x7b": [
+ [8, 2, 4096, 14336],
+ ],
+ "deepseek-v2": [
+ [160, 6, 5120, 12288],
+ ],
+ "custom-small": [
+ [8, 2, 2048, 7168],
+ ],
+ "glm45-fp8": [
+ [128, 8, 4096, 1408],
+ ],
+ "Llama-4-Maverick-17B-128E-Instruct-FP8": [
+ [128, 1, 5120, 8192],
+ ],
+}
+
+DEFAULT_MODELS = [
+ "mixtral-8x7b",
+]
+
+DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+DEFAULT_TP_SIZES = [1]
+
+PER_ACT_TOKEN_OPTS = [False, True]
+PER_OUT_CH_OPTS = [False, True]
+
+FP8_DTYPE = current_platform.fp8_dtype()
+
+
+def bench_run(
+ results: list,
+ model: str,
+ num_experts: int,
+ topk: int,
+ per_act_token: bool,
+ per_out_ch: bool,
+ mkn: tuple[int, int, int],
+):
+ (m, k, n) = mkn
+
+ dtype = torch.half
+ device = "cuda"
+
+ # Create input activations
+ a = torch.randn((m, k), device=device, dtype=dtype) / 10
+
+ # Create weights
+ w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
+ w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
+
+ # Create FP8 quantized weights and scales for both kernels
+ w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
+ w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
+
+ # Create scales based on quantization strategy
+ if per_out_ch:
+ # Per-channel quantization
+ w1_scale = torch.empty(
+ (num_experts, 2 * n, 1), device=device, dtype=torch.float32
+ )
+ w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
+ else:
+ # Per-tensor quantization
+ w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
+ w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
+
+ # Quantize weights
+ for expert in range(num_experts):
+ if per_out_ch:
+ # Per-channel quantization - not yet implemented properly
+ # For now, fall back to per-tensor quantization
+ w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
+ w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
+ # Expand scalar scales to the expected per-channel shape
+ w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
+ w2_scale[expert] = w2_scale_temp.expand(k, 1)
+ else:
+ # Per-tensor quantization
+ w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
+ w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
+ # Store scalar scales in [1, 1] tensors
+ w1_scale[expert, 0, 0] = w1_scale_temp
+ w2_scale[expert, 0, 0] = w2_scale_temp
+
+ # Prepare weights for CUTLASS (no transpose needed)
+ w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
+ w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
+
+ # Create router scores and get topk
+ score = torch.randn((m, num_experts), device=device, dtype=dtype)
+ topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
+
+ # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
+ # Force per-tensor quantization for all cases to match working e2e setup
+ a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
+ a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
+
+ # Force per-tensor quantization for all cases
+ per_act_token = False
+
+ # Create stride tensors for CUTLASS
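+    # Per-expert leading dimensions: K for the first GEMM's inputs, N for the
+    # second GEMM's inputs, and 2*N / K for their respective outputs.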
+ ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
+ ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
+ c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
+ c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
+
+ def run_triton_moe(
+ a: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ w1_scale: torch.Tensor,
+ w2_scale: torch.Tensor,
+ a1_scale: torch.Tensor,
+ a2_scale: torch.Tensor,
+ num_repeats: int,
+ ):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ per_act_token_quant=per_act_token,
+ per_out_ch_quant=per_out_ch,
+ )
+
+ for _ in range(num_repeats):
+ fused_experts(
+ a,
+ w1,
+ w2,
+ topk_weights,
+ topk_ids,
+ quant_config=quant_config,
+ )
+
+ def run_cutlass_moe_fp8(
+ a: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ ab_strides1: torch.Tensor,
+ ab_strides2: torch.Tensor,
+ c_strides1: torch.Tensor,
+ c_strides2: torch.Tensor,
+ w1_scale: torch.Tensor,
+ w2_scale: torch.Tensor,
+ a1_scale: torch.Tensor,
+ a2_scale: torch.Tensor,
+ num_repeats: int,
+ ):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ per_act_token_quant=per_act_token,
+ per_out_ch_quant=per_out_ch,
+ )
+
+ for _ in range(num_repeats):
+ with nvtx.annotate("cutlass_moe_fp8", color="blue"):
+ cutlass_moe_fp8(
+ a=a,
+ w1_q=w1,
+ w2_q=w2,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ ab_strides1=ab_strides1,
+ ab_strides2=ab_strides2,
+ c_strides1=c_strides1,
+ c_strides2=c_strides2,
+ quant_config=quant_config,
+ activation="silu",
+ global_num_experts=num_experts,
+ )
+
+ # Pre-create quantization config to avoid creating it inside CUDA graph
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ per_act_token_quant=per_act_token,
+ per_out_ch_quant=per_out_ch,
+ )
+
+ # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
+ cutlass_stream = torch.cuda.Stream()
+ cutlass_graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
+ # Capture 10 invocations like benchmark_moe.py
+ for _ in range(10):
+ cutlass_moe_fp8(
+ a=a,
+ w1_q=w1_fp8q_cutlass,
+ w2_q=w2_fp8q_cutlass,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ ab_strides1=ab_strides1,
+ ab_strides2=ab_strides2,
+ c_strides1=c_strides1,
+ c_strides2=c_strides2,
+ quant_config=quant_config,
+ activation="silu",
+ global_num_experts=num_experts,
+ )
+ torch.cuda.synchronize()
+
+ # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
+ triton_stream = torch.cuda.Stream()
+ triton_graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(triton_graph, stream=triton_stream):
+ # Capture 10 invocations like benchmark_moe.py
+ for _ in range(10):
+ fused_experts(
+ a,
+ w1_fp8q,
+ w2_fp8q,
+ topk_weights,
+ topk_ids,
+ quant_config=quant_config,
+ )
+ torch.cuda.synchronize()
+
+ def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
+ """Benchmark CUDA graph using events like benchmark_moe.py"""
+ # Warmup
+ for _ in range(num_warmup):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ # Timing
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
+
+ latencies = []
+ for _ in range(num_iters):
+ torch.cuda.synchronize()
+ start_event.record()
+ graph.replay()
+ end_event.record()
+ end_event.synchronize()
+ latencies.append(start_event.elapsed_time(end_event))
+
+ # Divide by 10 since graph contains 10 calls
+ return sum(latencies) / (num_iters * 10)
+
+ # Benchmark parameters
+ num_warmup = 5
+ num_iters = 100
+
+ # Benchmark only CUDA graphs (more reliable and faster)
+ # Benchmark Triton MoE with CUDA graphs
+ triton_graph_time = bench_cuda_graph(
+ triton_graph, num_warmup=num_warmup, num_iters=num_iters
+ )
+
+ # Benchmark CUTLASS MoE with CUDA graphs
+ cutlass_graph_time = bench_cuda_graph(
+ cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
+ )
+
+ # Convert ms to us and return results
+ triton_time_us = triton_graph_time * 1000
+ cutlass_time_us = cutlass_graph_time * 1000
+
+ return {
+ "batch_size": m,
+ "triton_time_us": triton_time_us,
+ "cutlass_time_us": cutlass_time_us,
+ }
+
+
+def main(args):
+ print("Benchmarking models:")
+ for i, model in enumerate(args.models):
+ print(f"[{i}] {model}")
+
+ all_results = []
+
+ for model in args.models:
+ for tp in args.tp_sizes:
+ for layer in WEIGHT_SHAPES_MOE[model]:
+ num_experts = layer[0]
+ topk = layer[1]
+ size_k = layer[2]
+ size_n = layer[3] // tp
+
+ if len(args.limit_k) > 0 and size_k not in args.limit_k:
+ continue
+
+ if len(args.limit_n) > 0 and size_n not in args.limit_n:
+ continue
+
+ for per_act_token in args.per_act_token_opts:
+ for per_out_ch in args.per_out_ch_opts:
+ print(
+                            f"\n=== {model}, experts={num_experts}, topk={topk}, "
+                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
+ )
+
+ config_results = []
+ for size_m in args.batch_sizes:
+ mkn = (size_m, size_k, size_n)
+ result = bench_run(
+ [], # Not used anymore
+ model,
+ num_experts,
+ topk,
+ per_act_token,
+ per_out_ch,
+ mkn,
+ )
+ if result:
+ config_results.append(result)
+
+ # Print results table for this configuration
+ if config_results:
+ print(
+ f"\n{'Batch Size':<12}"
+ f"{'Triton (us)':<15}"
+ f"{'CUTLASS (us)':<15}"
+ )
+ print("-" * 45)
+ for result in config_results:
+ print(
+ f"{result['batch_size']:<12}"
+ f"{result['triton_time_us']:<15.2f}"
+ f"{result['cutlass_time_us']:<15.2f}"
+ )
+
+ all_results.extend(config_results)
+
+ print(f"\nTotal benchmarks completed: {len(all_results)}")
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+ description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
+ across specified models/shapes/batches
+
+ Example usage:
+ python benchmark_cutlass_moe_fp8.py \
+ --model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
+ --tp-sizes 8 \
+ --batch-size 2 4 8 \
+ --per-act-token-opts false \
+ --per-out-ch-opts false
+
+ """
+ )
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=DEFAULT_MODELS,
+ choices=WEIGHT_SHAPES_MOE.keys(),
+ )
+ parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
+ parser.add_argument(
+ "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
+ )
+ parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+ parser.add_argument(
+ "--per-act-token-opts",
+ nargs="+",
+ type=lambda x: x.lower() == "true",
+ default=[False, True],
+ help="Per-activation token quantization options (true/false)",
+ )
+ parser.add_argument(
+ "--per-out-ch-opts",
+ nargs="+",
+ type=lambda x: x.lower() == "true",
+ default=[False, True],
+ help="Per-output channel quantization options (true/false)",
+ )
+
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
new file mode 100644
index 000000000000..df06a940e6d4
--- /dev/null
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -0,0 +1,508 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).
+
+For NCCL symmetric memory you need to set the environment variables
+NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1; otherwise NCCL does
+not use the fast NVLS implementation for all-reduce.
+
+Usage:
+    torchrun --nproc_per_node=<num_gpus> benchmark_device_communicators.py [options]
+
+Example:
+    torchrun --nproc_per_node=2 benchmark_device_communicators.py \
+        --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
+"""
+
+import json
+import os
+import time
+from collections.abc import Callable
+from contextlib import nullcontext
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.pynccl import (
+ PyNcclCommunicator,
+ register_nccl_symmetric_ops,
+)
+from vllm.distributed.device_communicators.pynccl_allocator import (
+ set_graph_pool_id,
+)
+from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+# Default sequence lengths to benchmark
+DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+
+# Fixed hidden size and dtype for all benchmarks
+HIDDEN_SIZE = 8192
+BENCHMARK_DTYPE = torch.bfloat16
+
+# CUDA graph settings
+CUDA_GRAPH_CAPTURE_CYCLES = 10
+
+
+class CommunicatorBenchmark:
+ """Benchmark class for testing device communicators."""
+
+ def __init__(
+ self,
+ rank: int,
+ world_size: int,
+ device: torch.device,
+ cpu_group: ProcessGroup,
+ sequence_lengths: list[int],
+ ):
+ self.rank = rank
+ self.world_size = world_size
+ self.device = device
+ self.cpu_group = cpu_group
+
+ # Calculate max_size_override based on largest sequence length
+ max_seq_len = max(sequence_lengths)
+ max_tensor_elements = max_seq_len * HIDDEN_SIZE
+ self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
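+        # +1 so the largest benchmarked tensor stays strictly below the
+        # communicators' size limit (assumed to be an exclusive bound).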
+
+ # Initialize communicators
+ self.custom_allreduce = None
+ self.pynccl_comm = None
+ self.symm_mem_comm = None
+ self.symm_mem_comm_multimem = None
+ self.symm_mem_comm_two_shot = None
+
+ self._init_communicators()
+
+ def _init_communicators(self):
+ """Initialize all available communicators."""
+ try:
+ self.custom_allreduce = CustomAllreduce(
+ group=self.cpu_group,
+ device=self.device,
+ max_size=self.max_size_override,
+ )
+ if not self.custom_allreduce.disabled:
+ logger.info("Rank %s: CustomAllreduce initialized", self.rank)
+ else:
+ logger.info("Rank %s: CustomAllreduce disabled", self.rank)
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
+ )
+ self.custom_allreduce = None
+
+ try:
+ self.pynccl_comm = PyNcclCommunicator(
+ group=self.cpu_group, device=self.device
+ )
+ if not self.pynccl_comm.disabled:
+ logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+ register_nccl_symmetric_ops(self.pynccl_comm)
+ else:
+ logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
+ self.pynccl_comm = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
+ )
+ self.pynccl_comm = None
+
+ # Initialize variants for SymmMemCommunicator
+ try:
+ self.symm_mem_comm_multimem = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=True,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_multimem.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_multimem = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_multimem = None
+
+ try:
+ self.symm_mem_comm_two_shot = SymmMemCommunicator(
+ group=self.cpu_group,
+ device=self.device,
+ force_multimem=False,
+ max_size_override=self.max_size_override,
+ )
+ if not self.symm_mem_comm_two_shot.disabled:
+ logger.info(
+ "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
+ )
+ else:
+ self.symm_mem_comm_two_shot = None
+ except Exception as e:
+ logger.warning(
+ "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
+ self.rank,
+ e,
+ )
+ self.symm_mem_comm_two_shot = None
+
+ def benchmark_allreduce(
+ self, sequence_length: int, num_warmup: int, num_trials: int
+ ) -> dict[str, float]:
+ """Benchmark allreduce operations for all available communicators."""
+
+ results = {}
+
+ # Define communicators with their benchmark functions
+ communicators = []
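+        # Each entry: (name, allreduce fn, eligibility check, graph-capture context,
+        # VLLM_CUSTOM_ALLREDUCE_ALGO value or None when no env override is needed)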
+
+ if self.custom_allreduce is not None:
+ comm = self.custom_allreduce
+ # CustomAllreduce one-shot
+ communicators.append(
+ (
+ "ca_1stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "1stage", # env variable value
+ )
+ )
+ # CustomAllreduce two-shot
+ communicators.append(
+ (
+ "ca_2stage",
+ lambda t, c=comm: c.custom_all_reduce(t),
+ lambda t, c=comm: c.should_custom_ar(t),
+ comm.capture(),
+ "2stage", # env variable value
+ )
+ )
+
+ if self.pynccl_comm is not None:
+ comm = self.pynccl_comm
+ communicators.append(
+ (
+ "pynccl",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t: True, # Always available if initialized
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+ communicators.append(
+ (
+ "pynccl-symm",
+ lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
+ lambda t: True, # Always available if initialized
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_multimem is not None:
+ comm = self.symm_mem_comm_multimem
+ communicators.append(
+ (
+ "symm_mem_multimem",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ if self.symm_mem_comm_two_shot is not None:
+ comm = self.symm_mem_comm_two_shot
+ communicators.append(
+ (
+ "symm_mem_two_shot",
+ lambda t, c=comm: c.all_reduce(t),
+ lambda t, c=comm: c.should_use_symm_mem(t),
+ nullcontext(),
+ None, # no env variable needed
+ )
+ )
+
+ # Benchmark each communicator
+ for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+ # Set environment variable if needed
+ if env_var is not None:
+ os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
+ else:
+ # Clear the environment variable to avoid interference
+ os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
+
+ latency = self.benchmark_allreduce_single(
+ sequence_length,
+ allreduce_fn,
+ should_use_fn,
+ context,
+ num_warmup,
+ num_trials,
+ )
+ if latency is not None:
+ results[name] = latency
+
+ return results
+
+ def benchmark_allreduce_single(
+ self,
+ sequence_length: int,
+ allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
+ should_use_fn: Callable[[torch.Tensor], bool],
+ context,
+ num_warmup: int,
+ num_trials: int,
+ ) -> float | None:
+ """Benchmark method with CUDA graph optimization."""
+ try:
+ # Create test tensor (2D: sequence_length x hidden_size)
+ tensor = torch.randn(
+ sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
+ )
+ if not should_use_fn(tensor):
+ return None
+
+ torch.cuda.synchronize()
+ stream = torch.cuda.Stream()
+ with torch.cuda.stream(stream):
+ graph_input = tensor.clone()
+
+ # Warmup before capture
+ for _ in range(3):
+ allreduce_fn(graph_input)
+
+ # Capture the graph using context manager
+ with context:
+ graph = torch.cuda.CUDAGraph()
+ graph_pool = torch.cuda.graph_pool_handle()
+ set_graph_pool_id(graph_pool)
+ with torch.cuda.graph(graph, pool=graph_pool):
+ for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
+ allreduce_fn(graph_input)
+
+ torch.cuda.synchronize()
+ for _ in range(num_warmup):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ torch.cuda.synchronize()
+ start_time = time.perf_counter()
+
+ for _ in range(num_trials):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ end_time = time.perf_counter()
+
+ # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
+ return (
+ (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
+ )
+
+ except Exception as e:
+ logger.error("CUDA graph benchmark failed: %s", e)
+ raise RuntimeError(
+ f"CUDA graph benchmark failed for communicator: {e}"
+ ) from e
+
+
+def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
+ """Calculate speedup information for a single tensor size."""
+ if not comm_results:
+ return "N/A"
+
+ # Find the fastest communicator
+ fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
+ fastest_time = comm_results[fastest_comm]
+
+ # Calculate speedup vs PyNccl if available
+ if "pynccl" in comm_results:
+ pynccl_time = comm_results["pynccl"]
+ speedup = pynccl_time / fastest_time
+ return f"{fastest_comm} ({speedup:.2f}x)"
+ else:
+ return f"{fastest_comm} (N/A)"
+
+
+def print_results(
+ results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
+):
+ """Print benchmark results in a formatted table."""
+
+ print(f"\n{'=' * 130}")
+ print("Device Communicator Benchmark Results")
+ print(
+ f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
+ f"Hidden Size: {HIDDEN_SIZE}"
+ )
+ print(f"{'=' * 130}")
+
+ # Get all communicator names
+ all_comms = set()
+ for size_results in results.values():
+ all_comms.update(size_results.keys())
+
+ all_comms = sorted(list(all_comms))
+
+ # Print header
+ header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
+ for comm in all_comms:
+ header += f"{comm:<20}"
+ header += f"{'Best (Speedup vs PyNccl)':<30}"
+ print(header)
+ print("-" * len(header))
+
+ # Print results for each sequence length
+ for seq_len in sequence_lengths:
+ if seq_len in results:
+ # Calculate tensor size in elements and bytes
+ tensor_elements = seq_len * HIDDEN_SIZE
+ tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
+
+ # Format tensor size (MB)
+ tensor_size_mb = tensor_bytes / (1024 * 1024)
+ tensor_size_str = f"{tensor_size_mb:.2f} MB"
+
+ # Format tensor shape
+ tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
+
+ row = f"{tensor_shape:<20}{tensor_size_str:<15}"
+ for comm in all_comms:
+ if comm in results[seq_len]:
+ row += f"{results[seq_len][comm]:<20.3f}"
+ else:
+ row += f"{'N/A':<20}"
+
+ # Calculate speedup information
+ speedup_info = _calculate_speedup_info(results[seq_len])
+ row += f"{speedup_info:<30}"
+
+ print(row)
+
+ print(f"{'=' * 130}")
+ print("All times are in milliseconds (ms) per allreduce operation")
+ print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
+
+
+def main():
+ parser = FlexibleArgumentParser(description="Benchmark device communicators")
+
+ parser.add_argument(
+ "--sequence-lengths",
+ type=int,
+ nargs="+",
+ default=DEFAULT_SEQUENCE_LENGTHS,
+ help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
+ )
+
+ parser.add_argument(
+ "--num-warmup", type=int, default=5, help="Number of warmup iterations"
+ )
+
+ parser.add_argument(
+ "--num-trials", type=int, default=50, help="Number of benchmark trials"
+ )
+
+ parser.add_argument("--output-json", type=str, help="Output results to JSON file")
+
+ args = parser.parse_args()
+
+ # Initialize distributed
+ if not dist.is_initialized():
+ dist.init_process_group(backend="gloo")
+ rank = dist.get_rank()
+ world_size = dist.get_world_size()
+
+ # Set device
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+
+ # Get CPU process group
+ cpu_group = dist.new_group(backend="gloo")
+
+    # Disable VLLM_ALLREDUCE_USE_SYMM_MEM so it does not alter the max sizes
+    # used by symm_mem and custom_all_reduce during the benchmark
+ os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+ # Initialize benchmark
+ benchmark = CommunicatorBenchmark(
+ rank, world_size, device, cpu_group, args.sequence_lengths
+ )
+
+ # Run benchmarks
+ all_results = {}
+
+ for seq_len in args.sequence_lengths:
+ if rank == 0:
+ logger.info(
+ "Benchmarking sequence length: %s (tensor shape: %s x %s)",
+ seq_len,
+ seq_len,
+ HIDDEN_SIZE,
+ )
+
+ results = benchmark.benchmark_allreduce(
+ sequence_length=seq_len,
+ num_warmup=args.num_warmup,
+ num_trials=args.num_trials,
+ )
+
+ all_results[seq_len] = results
+
+ # Synchronize between ranks
+ dist.barrier()
+
+ # Print results (only rank 0)
+ if rank == 0:
+ print_results(all_results, args.sequence_lengths, world_size)
+
+ # Save to JSON if requested
+ if args.output_json:
+ # Add speedup information to results
+ enhanced_results = {}
+ for seq_len, comm_results in all_results.items():
+ enhanced_results[seq_len] = {
+ "timings": comm_results,
+ "speedup_info": _calculate_speedup_info(comm_results),
+ }
+
+ output_data = {
+ "world_size": world_size,
+ "dtype": str(BENCHMARK_DTYPE),
+ "hidden_size": HIDDEN_SIZE,
+ "sequence_lengths": args.sequence_lengths,
+ "num_warmup": args.num_warmup,
+ "num_trials": args.num_trials,
+ "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
+ "results": enhanced_results,
+ }
+
+ with open(args.output_json, "w") as f:
+ json.dump(output_data, f, indent=2)
+
+ logger.info("Results saved to %s", args.output_json)
+
+ # Cleanup
+ if cpu_group != dist.group.WORLD:
+ dist.destroy_process_group(cpu_group)
+
+
+if __name__ == "__main__":
+ main()
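+
+
+# Example launch (a sketch; the script name below is illustrative -- adjust it to
+# wherever this benchmark lives in your checkout):
+#   torchrun --nproc-per-node=2 benchmark_device_communicators.py \
+#       --sequence-lengths 128 1024 4096 --num-trials 100 --output-json results.json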
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index a6b42406b5cb..14330ae6f03c 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -7,6 +7,7 @@
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
@@ -96,6 +97,11 @@ def run_triton_moe(
a_scale: torch.Tensor,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_scale,
+ )
for _ in range(num_repeats):
fused_experts(
a,
@@ -103,10 +109,7 @@ def run_triton_moe(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_scale,
+ quant_config=quant_config,
)
def run_cutlass_moe(
@@ -125,6 +128,12 @@ def run_cutlass_moe(
per_act_token: bool,
num_repeats: int,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ per_act_token_quant=per_act_token,
+ )
+
for _ in range(num_repeats):
cutlass_moe_fp8(
a,
@@ -132,14 +141,11 @@ def run_cutlass_moe(
w2,
topk_weights,
topk_ids,
- w1_scale,
- w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
- per_act_token,
- a1_scale=None,
+ quant_config=quant_config,
)
def run_cutlass_from_graph(
@@ -156,6 +162,12 @@ def run_cutlass_from_graph(
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ per_act_token_quant=per_act_token,
+ )
+
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@@ -165,14 +177,11 @@ def run_cutlass_from_graph(
w2_q,
topk_weights,
topk_ids,
- w1_scale,
- w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
- per_act_token,
- a1_scale=None,
+ quant_config=quant_config,
)
def run_triton_from_graph(
@@ -185,6 +194,11 @@ def run_triton_from_graph(
w2_scale: torch.Tensor,
a_scale: torch.Tensor,
):
+ quant_config = fp8_w8a8_moe_quant_config(
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a_scale,
+ )
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
):
@@ -194,10 +208,7 @@ def run_triton_from_graph(
w2,
topk_weights,
topk_ids,
- use_fp8_w8a8=True,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a_scale,
+ quant_config=quant_config,
)
def replay_graph(graph, num_repeats):
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 69978ec6b23e..bcfa64c3f425 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -7,7 +7,8 @@
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 89309c79f099..39338f338761 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -6,11 +6,12 @@
import json
import pickle
import time
+from collections.abc import Callable
from dataclasses import dataclass
from enum import Enum, auto
from itertools import product
from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
@@ -79,9 +80,9 @@ def make_rand_lora_weight_tensor(
def make_rand_tensors(
- a_shape: tuple[int],
- b_shape: tuple[int],
- c_shape: tuple[int],
+ a_shape: tuple[int, ...],
+ b_shape: tuple[int, ...],
+ c_shape: tuple[int, ...],
a_dtype: torch.dtype,
b_dtype: torch.dtype,
c_dtype: torch.dtype,
@@ -158,7 +159,7 @@ def ref_group_gemm(
seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor,
scaling: float,
- add_inputs: Optional[bool],
+ add_inputs: bool | None,
):
"""
Torch group gemm reference implementation to test correctness of
@@ -243,7 +244,7 @@ def matmul_shapes(
lora_rank: int,
num_loras: int,
num_slices: int,
- ) -> tuple[tuple[int], tuple[int], tuple[int]]:
+ ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
"""
Given num_slices, return the shapes of the A, B, and C matrices
in A x B = C, for the op_type
@@ -316,8 +317,8 @@ class BenchmarkContext:
lora_rank: int
sort_by_lora_id: bool
dtype: torch.dtype
- seq_length: Optional[int] = None
- num_slices: Optional[int] = None # num_slices for slice based ops
+ seq_length: int | None = None
+ num_slices: int | None = None # num_slices for slice based ops
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
ctx = copy.copy(self)
@@ -464,7 +465,11 @@ def to_device(tensor: torch.Tensor):
for field_name in LoRAKernelMeta.__dataclass_fields__:
field = getattr(self.lora_kernel_meta, field_name)
assert isinstance(field, torch.Tensor)
- setattr(self.lora_kernel_meta, field_name, to_device(field))
+ setattr(
+ self.lora_kernel_meta,
+ field_name,
+ to_device(field) if field_name != "no_lora_flag_cpu" else field,
+ )
def metadata(self) -> tuple[int, int, int]:
"""
@@ -512,6 +517,7 @@ def as_lora_shrink_kwargs(self) -> dict[str, Any]:
"lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"scaling": 1.0,
+ "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@@ -552,10 +558,11 @@ def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
"lora_ids": self.lora_kernel_meta.active_lora_ids,
"offset_start": 0,
"add_inputs": add_inputs,
+ "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
}
def bench_fn_kwargs(
- self, op_type: OpType, add_inputs: Optional[bool] = None
+ self, op_type: OpType, add_inputs: bool | None = None
) -> dict[str, Any]:
if op_type.is_shrink_fn():
assert add_inputs is None
@@ -569,7 +576,7 @@ def bench_fn_kwargs(
raise ValueError(f"Unrecognized optype {self}")
def test_correctness(
- self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+ self, op_type: OpType, expand_fn_add_inputs: bool | None
) -> bool:
"""
Test correctness of op_type implementation against a grouped gemm
@@ -605,8 +612,8 @@ def bench_optype(
ctx: BenchmarkContext,
arg_pool_size: int,
op_type: OpType,
- cuda_graph_nops: Optional[int] = None,
- expand_fn_add_inputs: Optional[bool] = None,
+ cuda_graph_nops: int | None = None,
+ expand_fn_add_inputs: bool | None = None,
test_correctness: bool = False,
) -> TMeasurement:
assert arg_pool_size >= 1
@@ -673,7 +680,7 @@ def bench_torch_mm(
ctx: BenchmarkContext,
arg_pool_size: int,
op_type: OpType,
- cuda_graph_nops: Optional[int] = None,
+ cuda_graph_nops: int | None = None,
) -> TMeasurement:
"""
Benchmark basic torch.mm as a roofline.
@@ -738,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
"""
-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
compare = TBenchmark.Compare(timers)
compare.print()
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 1b1c3b321cce..e1d5239f5cc9 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -8,10 +8,9 @@
import os
import pickle as pkl
import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
-from typing import Callable, Optional
import pandas as pd
import torch
@@ -63,23 +62,23 @@ class BenchmarkTensors:
a: torch.Tensor
w_q: torch.Tensor
- group_size: Optional[int]
+ group_size: int | None
wtype: ScalarType
w_g_s: torch.Tensor
- w_g_zp: Optional[torch.Tensor]
- w_ch_s: Optional[torch.Tensor]
- w_tok_s: Optional[torch.Tensor]
+ w_g_zp: torch.Tensor | None
+ w_ch_s: torch.Tensor | None
+ w_tok_s: torch.Tensor | None
@dataclass
class TypeConfig:
act_type: torch.dtype
weight_type: ScalarType
- output_type: Optional[torch.dtype]
- group_scale_type: Optional[torch.dtype]
- group_zero_type: Optional[torch.dtype]
- channel_scale_type: Optional[torch.dtype]
- token_scale_type: Optional[torch.dtype]
+ output_type: torch.dtype | None
+ group_scale_type: torch.dtype | None
+ group_zero_type: torch.dtype | None
+ channel_scale_type: torch.dtype | None
+ token_scale_type: torch.dtype | None
def rand_data(shape, dtype=torch.float16, scale=1):
@@ -93,8 +92,8 @@ def quantize_and_pack(
atype: torch.dtype,
w: torch.Tensor,
wtype: ScalarType,
- stype: Optional[torch.dtype],
- group_size: Optional[int],
+ stype: torch.dtype | None,
+ group_size: int | None,
zero_points: bool = False,
):
assert wtype.is_integer(), "TODO: support floating point weights"
@@ -113,7 +112,7 @@ def quantize_and_pack(
def create_bench_tensors(
- shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+ shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
) -> list[BenchmarkTensors]:
m, n, k = shape
@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
return res
-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
def bench(
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 6259aa0dd629..9298d3b58dfb 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -14,6 +14,10 @@
import torch
from ray.experimental.tqdm_ray import tqdm
+from vllm.model_executor.layers.fused_moe.config import (
+ FusedMoEQuantConfig,
+ _get_config_dtype_str,
+)
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
@@ -134,43 +138,36 @@ def prepare(i: int):
def run():
from vllm.model_executor.layers.fused_moe import override_config
+ if use_fp8_w8a8:
+ quant_dtype = torch.float8_e4m3fn
+ elif use_int8_w8a16:
+ quant_dtype = torch.int8
+ else:
+ quant_dtype = None
+
+ quant_config = FusedMoEQuantConfig.make(
+ quant_dtype=quant_dtype,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ block_shape=block_quant_shape,
+ )
+
with override_config(config):
- if use_deep_gemm:
- topk_weights, topk_ids, token_expert_indices = fused_topk(
- x, input_gating, topk, False
- )
- return fused_experts(
- x,
- w1,
- w2,
- topk_weights,
- topk_ids,
- inplace=True,
- use_fp8_w8a8=use_fp8_w8a8,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- block_shape=block_quant_shape,
- allow_deep_gemm=True,
- )
- else:
- fused_moe(
- x,
- w1,
- w2,
- input_gating,
- topk,
- renormalize=True,
- inplace=True,
- use_fp8_w8a8=use_fp8_w8a8,
- use_int8_w8a16=use_int8_w8a16,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- block_shape=block_quant_shape,
- )
+ topk_weights, topk_ids, token_expert_indices = fused_topk(
+ x, input_gating, topk, renormalize=not use_deep_gemm
+ )
+ return fused_experts(
+ x,
+ w1,
+ w2,
+ topk_weights,
+ topk_ids,
+ inplace=True,
+ quant_config=quant_config,
+ allow_deep_gemm=use_deep_gemm,
+ )
# JIT compilation & warmup
run()
@@ -414,7 +411,7 @@ def benchmark(
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
- dtype_str = get_config_dtype_str(
+ dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -547,7 +544,7 @@ def save_configs(
block_quant_shape: list[int],
save_dir: str,
) -> None:
- dtype_str = get_config_dtype_str(
+ dtype_str = _get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
)
@@ -560,7 +557,7 @@ def save_configs(
filename = os.path.join(save_dir, filename)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
- json.dump(configs, f, indent=4)
+ json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
f.write("\n")
@@ -582,26 +579,42 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
+ hidden_size = config.hidden_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
+ hidden_size = config.hidden_size
elif config.architectures[0] in (
- "DeepseekV3ForCausalLM",
"DeepseekV2ForCausalLM",
+ "DeepseekV3ForCausalLM",
+ "DeepseekV32ForCausalLM",
"Glm4MoeForCausalLM",
):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
- elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+ hidden_size = config.hidden_size
+ elif config.architectures[0] in (
+ "Qwen2MoeForCausalLM",
+ "Qwen3MoeForCausalLM",
+ "Qwen3NextForCausalLM",
+ ):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
+ hidden_size = config.hidden_size
+ elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+ text_config = config.get_text_config()
+ E = text_config.num_experts
+ topk = text_config.num_experts_per_tok
+ intermediate_size = text_config.moe_intermediate_size
+ hidden_size = text_config.hidden_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
+ hidden_size = config.hidden_size
else:
# Support for llama4
config = config.get_text_config()
@@ -609,6 +622,7 @@ def main(args: argparse.Namespace):
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
+ hidden_size = config.hidden_size
enable_ep = bool(args.enable_expert_parallel)
if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts")
@@ -617,8 +631,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
- hidden_size = config.hidden_size
- dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+ dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index 04d2205aa372..459eafa6d907 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
hidden_size = config.hidden_size
- dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+ dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 7e0376c18ecc..1b1e71adeec4 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -3,16 +3,15 @@
import random
import time
-from typing import Optional
import torch
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
- FlexibleArgumentParser,
create_kv_caches_with_random,
)
@@ -37,7 +36,7 @@ def main(
seed: int,
do_profile: bool,
device: str = "cuda",
- kv_cache_dtype: Optional[str] = None,
+ kv_cache_dtype: str | None = None,
) -> None:
current_platform.seed_everything(seed)
diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py
index 1ccb5e08b3d5..bdc1eb733084 100644
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -3,8 +3,8 @@
import argparse
import math
+from collections.abc import Callable
from contextlib import contextmanager
-from typing import Callable
from unittest.mock import patch
import torch
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 6ab26f5f1adf..61427a77b4e3 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -7,7 +7,8 @@
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py
new file mode 100644
index 000000000000..e0ff09d4b397
--- /dev/null
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
+ STR_DTYPE_TO_TORCH_DTYPE,
+ create_kv_caches_with_random,
+)
+
+logger = init_logger(__name__)
+
+
+@torch.inference_mode()
+def run_benchmark(
+ num_tokens: int,
+ num_heads: int,
+ head_size: int,
+ block_size: int,
+ num_blocks: int,
+ dtype: torch.dtype,
+ kv_cache_dtype: str,
+ num_iters: int,
+ benchmark_mode: str,
+ device: str = "cuda",
+) -> float:
+ """Return latency (seconds) for given num_tokens."""
+
+ if kv_cache_dtype == "fp8" and head_size % 16:
+ raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+
+ current_platform.seed_everything(42)
+ torch.set_default_device(device)
+
+ # create random key / value tensors [T, H, D].
+ key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+ value = torch.randn_like(key)
+
+ # prepare the slot mapping.
+ # each token is assigned a unique slot in the KV-cache.
+ num_slots = block_size * num_blocks
+ if num_tokens > num_slots:
+ raise ValueError("num_tokens cannot exceed the total number of cache slots")
+ slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+ slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+
+ key_caches, value_caches = create_kv_caches_with_random(
+ num_blocks,
+ block_size,
+ 1, # num_layers
+ num_heads,
+ head_size,
+ kv_cache_dtype,
+ dtype,
+ device=device,
+ )
+ key_cache, value_cache = key_caches[0], value_caches[0]
+ # to free unused memory
+ del key_caches, value_caches
+
+ # compute per-kernel scaling factors for fp8 conversion (if used).
+ k_scale = (key.amax() / 64.0).to(torch.float32)
+ v_scale = (value.amax() / 64.0).to(torch.float32)
+
+ function_under_test = lambda: ops.reshape_and_cache(
+ key, # noqa: F821
+ value, # noqa: F821
+ key_cache, # noqa: F821
+ value_cache, # noqa: F821
+ slot_mapping, # noqa: F821
+ kv_cache_dtype,
+ k_scale,
+ v_scale,
+ )
+
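+    # In cudagraph mode the op is captured once and replayed, so the timing loop
+    # below excludes per-launch CPU overhead.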
+ if benchmark_mode == "cudagraph":
+ g = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(g):
+ function_under_test()
+ torch.cuda.synchronize()
+ function_under_test = lambda: g.replay()
+
+ def run_cuda_benchmark(n_iters: int) -> float:
+ nonlocal key, value, key_cache, value_cache, slot_mapping
+ torch.cuda.synchronize()
+ start = time.perf_counter()
+ for _ in range(n_iters):
+ function_under_test()
+ torch.cuda.synchronize()
+ end = time.perf_counter()
+ return (end - start) / n_iters
+
+ # warm-up
+ run_cuda_benchmark(3)
+
+ lat = run_cuda_benchmark(num_iters)
+
+ # free tensors to mitigate OOM when sweeping
+ del key, value, key_cache, value_cache, slot_mapping
+ torch.cuda.empty_cache()
+
+ return lat
+
+
+def main(args):
+ rows = []
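+    # Sweep num_tokens over powers of two: 2, 4, ..., 65536.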
+ for exp in range(1, 17):
+ n_tok = 2**exp
+ lat = run_benchmark(
+ num_tokens=n_tok,
+ num_heads=args.num_heads,
+ head_size=args.head_size,
+ block_size=args.block_size,
+ num_blocks=args.num_blocks,
+ dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+ kv_cache_dtype=args.kv_cache_dtype,
+ num_iters=args.iters,
+ benchmark_mode=args.mode,
+ device="cuda",
+ )
+ rows.append([n_tok, lat * 1e6]) # convert to microseconds
+
+ print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
+ print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser()
+
+ parser.add_argument("--num-heads", type=int, default=128)
+ parser.add_argument(
+ "--head-size",
+ type=int,
+ choices=[64, 80, 96, 112, 120, 128, 192, 256],
+ default=128,
+ )
+ parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+ parser.add_argument("--num-blocks", type=int, default=128 * 128)
+
+ parser.add_argument(
+ "--dtype",
+ type=str,
+ choices=["half", "bfloat16", "float"],
+ default="bfloat16",
+ )
+
+ parser.add_argument(
+ "--kv-cache-dtype",
+ type=str,
+ choices=["auto", "fp8"],
+ default="auto",
+ )
+
+ parser.add_argument("--iters", type=int, default=200)
+
+ parser.add_argument(
+ "--mode",
+ type=str,
+ choices=["cudagraph", "no_graph"],
+ default="cudagraph",
+ )
+
+ args = parser.parse_args()
+
+ main(args)
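+
+
+# Example (single GPU), sweeping num_tokens over powers of two from 2 to 65536:
+#   python benchmarks/kernels/benchmark_reshape_and_cache.py \
+#       --num-heads 8 --head-size 128 --kv-cache-dtype fp8 --mode cudagraph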
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index d4648c18f31d..29f1b2ccdcf6 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
import random
import time
@@ -9,11 +7,14 @@
from tabulate import tabulate
from vllm import _custom_ops as ops
+from vllm.attention.ops.triton_reshape_and_cache_flash import (
+ triton_reshape_and_cache_flash,
+)
from vllm.logger import init_logger
from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
- FlexibleArgumentParser,
create_kv_caches_with_random_flash,
)
@@ -31,6 +32,8 @@ def run_benchmark(
kv_cache_dtype: str,
kv_cache_layout: str,
num_iters: int,
+ implementation: str,
+ benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
@@ -38,6 +41,14 @@ def run_benchmark(
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+ if implementation not in ("cuda", "triton"):
+ raise ValueError(
+ f"Unsupported implementation: {implementation}. "
+ "Only 'cuda' and 'triton' are supported."
+ )
+ if implementation == "triton" and kv_cache_layout == "HND":
+ return float("nan") # Triton does not support HND layout yet.
+
current_platform.seed_everything(42)
torch.set_default_device(device)
@@ -65,27 +76,49 @@ def run_benchmark(
cache_layout=kv_cache_layout,
)
key_cache, value_cache = key_caches[0], value_caches[0]
+ # to free unused memory
+ del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
+ if implementation == "cuda":
+ function_under_test = lambda: ops.reshape_and_cache_flash(
+ key, # noqa: F821
+ value, # noqa: F821
+ key_cache, # noqa: F821
+ value_cache, # noqa: F821
+ slot_mapping, # noqa: F821
+ kv_cache_dtype,
+ k_scale,
+ v_scale,
+ )
+ else:
+ function_under_test = lambda: triton_reshape_and_cache_flash(
+ key, # noqa: F821
+ value, # noqa: F821
+ key_cache, # noqa: F821
+ value_cache, # noqa: F821
+ slot_mapping, # noqa: F821
+ kv_cache_dtype,
+ k_scale,
+ v_scale,
+ )
+ if benchmark_mode == "cudagraph":
+ g = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(g):
+ function_under_test()
+ torch.cuda.synchronize()
+ function_under_test = lambda: g.replay()
+
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
- ops.reshape_and_cache_flash(
- key,
- value,
- key_cache,
- value_cache,
- slot_mapping,
- kv_cache_dtype,
- k_scale,
- v_scale,
- )
- torch.cuda.synchronize()
+ function_under_test()
+ torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@@ -116,10 +149,16 @@ def main(args):
kv_cache_dtype=args.kv_cache_dtype,
kv_cache_layout=layout,
num_iters=args.iters,
+ implementation=args.implementation,
+ benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
+ print(
+ f"Benchmark results for implementation {args.implementation}"
+ f" (measuring with {args.mode}):"
+ )
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
@@ -151,6 +190,21 @@ def main(args):
)
parser.add_argument("--iters", type=int, default=100)
+
+ parser.add_argument(
+ "--implementation",
+ type=str,
+ choices=["cuda", "triton"],
+ default="cuda",
+ )
+
+ parser.add_argument(
+ "--mode",
+ type=str,
+ choices=["cudagraph", "no_graph"],
+ default="cudagraph",
+ )
+
args = parser.parse_args()
main(args)
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
index 4cf633a81358..d8d7f5bcf9da 100644
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
-from typing import Optional, Union
import torch
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@@ -21,8 +20,8 @@ def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
def forward(
self,
x: torch.Tensor,
- residual: Optional[torch.Tensor] = None,
- ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+ residual: torch.Tensor | None = None,
+ ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
orig_dtype = x.dtype
x = x.to(torch.float32)
if residual is not None:
@@ -41,7 +40,7 @@ def forward(
def rmsnorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
- residual: Optional[torch.Tensor] = None,
+ residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@@ -65,7 +64,7 @@ def rmsnorm_naive(
def rmsnorm_flashinfer(
x: torch.Tensor,
weight: torch.Tensor,
- residual: Optional[torch.Tensor] = None,
+ residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
orig_shape = x.shape
@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
def rmsnorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
- residual: Optional[torch.Tensor] = None,
+ residual: torch.Tensor | None = None,
eps: float = 1e-6,
):
orig_shape = x.shape
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index b81baf17a8c6..24869c91a8d7 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate
-from typing import Optional
import nvtx
import torch
@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
seq_len: int,
num_heads: int,
head_size: int,
- rotary_dim: Optional[int],
+ rotary_dim: int | None,
dtype: torch.dtype,
seed: int,
device: str,
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index 0650cbf3cc18..a5887aafd30d 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -1,77 +1,720 @@
-#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
+"""
+Comprehensive SiLU Benchmark Suite
+
+This benchmark compares two SiLU implementations:
+1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
+2. Triton Kernel - Triton-based implementation
+
+The suite generates detailed performance comparisons including:
+- Memory bandwidth utilization
+- Speedup ratios (baseline vs optimized implementations)
+- Performance across different expert configurations and token distributions
+"""
+
+from collections.abc import Callable
+
+import matplotlib.pyplot as plt
+import numpy as np
import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
- silu_mul_fp8_quant_deep_gemm,
+ persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+ # Pointers ------------------------------------------------------------
+ input_ptr, # 16-bit activations (E, T, 2*H)
+ y_q_ptr, # fp8 quantized activations (E, T, H)
+ y_s_ptr, # 16-bit scales (E, T, G)
+ counts_ptr, # int32 num tokens per expert (E)
+ # Sizes ---------------------------------------------------------------
+ H: tl.constexpr, # hidden dimension (per output)
+ GROUP_SIZE: tl.constexpr, # elements per group (usually 128)
+ # Strides for input (elements) ---------------------------------------
+ stride_i_e,
+ stride_i_t,
+ stride_i_h,
+ # Strides for y_q (elements) -----------------------------------------
+ stride_yq_e,
+ stride_yq_t,
+ stride_yq_h,
+ # Strides for y_s (elements) -----------------------------------------
+ stride_ys_e,
+ stride_ys_t,
+ stride_ys_g,
+ # Stride for counts (elements)
+ stride_counts_e,
+ # Numeric params ------------------------------------------------------
+ eps: tl.constexpr,
+ fp8_min: tl.constexpr,
+ fp8_max: tl.constexpr,
+ use_ue8m0: tl.constexpr,
+ # Meta ---------------------------------------------------------------
+ BLOCK: tl.constexpr,
+ NUM_STAGES: tl.constexpr,
+):
+ G = H // GROUP_SIZE
+
+ # map program id -> (e, g)
+ pid = tl.program_id(0)
+ e = pid // G
+ g = pid % G
+
+ e = e.to(tl.int64)
+ g = g.to(tl.int64)
+
+ # number of valid tokens for this expert
+ n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+
+ cols = tl.arange(0, BLOCK).to(tl.int64)
+ mask = cols < BLOCK
+
+ base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
+ base_gate_offset = base_input_offset + cols * stride_i_h
+ base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
+ base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
+ base_ys_offset = e * stride_ys_e + g * stride_ys_g
+
+ for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
+ gate = tl.load(
+ input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+ ).to(tl.float32)
+ up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
+
+ gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
+ y = gate * up
+
+ y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
+ if use_ue8m0:
+ y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
+ y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
-def benchmark(E, T, H, G=128, runs=50):
- current_platform.seed_everything(42)
- y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
- tokens_per_expert = torch.randint(
- T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
+ tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
+ tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
+
+
+def silu_mul_fp8_quant_deep_gemm_triton(
+ y: torch.Tensor, # (E, T, 2*H)
+ tokens_per_expert: torch.Tensor, # (E,) number of valid tokens per expert
+ num_parallel_tokens,
+ group_size: int = 128,
+ eps: float = 1e-10,
+    expert_offsets: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+ """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
+
+ y has shape (E, T, 2*H). The first half of the last dimension is
+ silu-activated, multiplied by the second half, then quantized into FP8.
+
+ Returns `(y_q, y_s)` where
+ * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
+ * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+ """
+ assert y.ndim == 3, "y must be (E, T, 2*H)"
+ E, T, H2 = y.shape
+ assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+ H = H2 // 2
+ G = (H + group_size - 1) // group_size
+ assert H % group_size == 0, "H must be divisible by group_size"
+ assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
+ "tokens_per_expert must be shape (E,)"
+ )
+ tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
+
+ # allocate outputs
+ fp8_dtype = torch.float8_e4m3fn
+ y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+ # strides (elements)
+ stride_i_e, stride_i_t, stride_i_h = y.stride()
+ stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+ # desired scale strides (elements): (T*G, 1, T)
+ stride_ys_e = T * G
+ stride_ys_t = 1
+ stride_ys_g = T
+ y_s = torch.empty_strided(
+ (E, T, G),
+ (stride_ys_e, stride_ys_t, stride_ys_g),
+ dtype=torch.float32,
+ device=y.device,
)
+ stride_cnt_e = tokens_per_expert.stride()[0]
+
+ # Static grid over experts and H-groups.
+ # A loop inside the kernel handles the token dim
+ grid = (E * G,)
+
+ f_info = torch.finfo(fp8_dtype)
+ fp8_max = f_info.max
+ fp8_min = f_info.min
+
+ _silu_mul_fp8_quant_deep_gemm[grid](
+ y,
+ y_q,
+ y_s,
+ tokens_per_expert,
+ H,
+ group_size,
+ stride_i_e,
+ stride_i_t,
+ stride_i_h,
+ stride_yq_e,
+ stride_yq_t,
+ stride_yq_h,
+ stride_ys_e,
+ stride_ys_t,
+ stride_ys_g,
+ stride_cnt_e,
+ eps,
+ fp8_min,
+ fp8_max,
+ is_deep_gemm_e8m0_used(),
+ BLOCK=group_size,
+ NUM_STAGES=4,
+ num_warps=1,
+ )
+
+ return y_q, y_s
+
+
+# Token-distribution generation strategies to benchmark
+strategies = ["random_imbalanced", "uniform", "max_t"]
+
+
+def benchmark(
+ kernel: Callable,
+ E: int,
+ T: int,
+ H: int,
+ total_tokens: int,
+ num_parallel_tokens: int = 64,
+ G: int = 128,
+ runs: int = 200,
+ num_warmups: int = 20,
+ gen_strategy: str = "default",
+ iterations_per_run: int = 20,
+):
+ def generate_data(seed_offset=0):
+ """Generate input data with given seed offset"""
+ current_platform.seed_everything(42 + seed_offset)
+ y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
+
+ if gen_strategy == "random_imbalanced":
+
+ def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
+ mean = total_tokens // n_e
+ min_max = mean // ratio
+                e = torch.ones(size=(n_e,), dtype=torch.int64, device=device) * mean
+                r = torch.rand(size=(n_e - 1,))
+ r = torch.rand(size=(E - 1,))
+ r /= r.sum()
+ r *= total_tokens - min_max
+ r = r.round().long()
+ e[1:] = r.to(device=device)
+ return e
+
+ tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
+ elif gen_strategy == "uniform":
+ r = torch.rand(size=(E,))
+ r /= r.sum()
+ r *= total_tokens
+ r = r.round().long()
+ tokens_per_expert = r
+ elif gen_strategy == "max_t":
+ tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
+            tokens_per_expert.fill_(total_tokens // E)
+ elif gen_strategy == "first_t":
+ tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
+ tokens_per_expert[0] = min(T, total_tokens)
+ else:
+ raise ValueError(f"Unknown generation strategy: {gen_strategy}")
+ return y, tokens_per_expert
+
+ dataset_count = 4
+ # Pre-generate different input matrices for each iteration to avoid cache effects
+ data_sets = [generate_data(i) for i in range(dataset_count)]
+
# Warmup
- for _ in range(10):
- silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
- torch.cuda.synchronize()
+ y, tokens_per_expert = data_sets[0]
+ for _ in range(num_warmups):
+ kernel(
+ y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
+ )
+ torch.cuda.synchronize()
+
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
# Benchmark
- torch.cuda.synchronize()
- start = time.perf_counter()
+ latencies: list[float] = []
for _ in range(runs):
- silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
- torch.cuda.synchronize()
+ torch.cuda.synchronize()
- avg_time = (time.perf_counter() - start) / runs * 1000
+ start_event.record()
+ for i in range(iterations_per_run):
+ y, tokens_per_expert = data_sets[i % dataset_count]
+ kernel(
+ y,
+ tokens_per_expert,
+ num_parallel_tokens=num_parallel_tokens,
+ group_size=G,
+ )
+ end_event.record()
+ end_event.synchronize()
- # Calculate actual work done (only count valid tokens)
+ total_time_ms = start_event.elapsed_time(end_event)
+ per_iter_time_ms = total_time_ms / iterations_per_run
+ latencies.append(per_iter_time_ms)
+
+ # Use median instead of average for better outlier handling
+ median_time_ms = np.median(latencies)
+ median_time_s = median_time_ms / 1000
+
+ # Calculate actual work done (using first dataset for consistency)
+ _, tokens_per_expert = data_sets[0]
actual_tokens = tokens_per_expert.sum().item()
actual_elements = actual_tokens * H
# GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
ops_per_element = 8
total_ops = actual_elements * ops_per_element
- gflops = total_ops / (avg_time / 1000) / 1e9
+ gflops = total_ops / median_time_s / 1e9
# Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs
output_bytes = actual_tokens * H * 1 # H fp8 outputs
scale_bytes = actual_tokens * (H // G) * 4 # scales in float32
total_bytes = input_bytes + output_bytes + scale_bytes
- memory_bw = total_bytes / (avg_time / 1000) / 1e9
+ memory_bw = total_bytes / median_time_s / 1e9
+
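+    # Reference peak: H100 (Hopper) HBM3 is ~3.35 TB/s; the utilization figure
+    # returned below is relative to this number.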
+ HOPPER_BANDWIDTH_TBPS = 3.35
+ return (
+ median_time_ms,
+ gflops,
+ memory_bw,
+        (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1000)) * 100,  # both in decimal GB/s
+ )
+
+
+def create_comparison_plot(
+ ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
+):
+ fig, ax = plt.subplots(1, 1, figsize=(18, 6))
+
+ # Configure x-axis positions
+ x = np.arange(len(config_labels))
+ width = 0.25
+
+ # Execution Time plot (lower is better)
+ ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
+ ax.bar(
+ x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
+ )
+
+    # Add speedup labels over each bar pair
+ for i in range(len(x)):
+ triton_v2_speedup = ratios[i][1] # triton/v2
+ max_height = max(silu_v2_times[i], triton_times[i])
+
+ # Triton/V2 speedup
+ ax.text(
+ x[i] + width / 2,
+ max_height + max_height * 0.02,
+ f"{triton_v2_speedup:.2f}x",
+ ha="center",
+ va="bottom",
+ fontweight="bold",
+ fontsize=8,
+ )
+
+ ax.set_xlabel("Configuration")
+ ax.set_ylabel("% Utilization")
+ ax.set_title(
+ f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(config_labels, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ return fig, ax
- return avg_time, gflops, memory_bw
+def create_combined_plot(all_results):
+ num_strategies = len(all_results)
+ fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
+ if num_strategies == 1:
+ axes = [axes]
+
+ for idx, (
+ strategy_name,
+ all_ratios,
+ all_silu_v2_results,
+ all_triton_results,
+ config_labels,
+ config_x_axis,
+ ) in enumerate(all_results):
+ ax = axes[idx]
+
+ # Flatten the nested results to get bandwidth percentages for plotting
+ silu_v2_bandwidths = []
+ triton_bandwidths = []
+ flat_ratios = []
+
+ for config_results in all_silu_v2_results:
+ for result in config_results:
+ silu_v2_bandwidths.append(result[3]) # bandwidth percentage
+
+ for config_results in all_triton_results:
+ for result in config_results:
+ triton_bandwidths.append(result[3]) # bandwidth percentage
+
+ for config_ratios in all_ratios:
+ for ratio in config_ratios:
+ flat_ratios.append(ratio)
+
+ # Configure x-axis positions
+ x = np.arange(len(config_labels))
+ width = 0.25
+
+ # Bandwidth utilization plot (higher is better)
+ ax.bar(
+ x,
+ silu_v2_bandwidths,
+ width,
+ label="SiLU V2 (CUDA)",
+ alpha=0.8,
+ color="blue",
+ )
+ ax.bar(
+ x + width,
+ triton_bandwidths,
+ width,
+ label="Triton Kernel",
+ alpha=0.8,
+ color="green",
+ )
+
+        # Add speedup labels over each bar pair
+ for i in range(len(x)):
+ triton_v2_speedup = flat_ratios[i] # triton/v2
+ max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
+
+ # Triton/V2 speedup
+ ax.text(
+ x[i] + width / 2,
+ max_height + max_height * 0.02,
+ f"{triton_v2_speedup:.2f}x",
+ ha="center",
+ va="bottom",
+ fontweight="bold",
+ fontsize=8,
+ )
+
+ ax.set_xlabel("Configuration")
+ ax.set_ylabel("% Utilization")
+ ax.set_title(
+ f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+ )
+ ax.set_xticks(x)
+ ax.set_xticklabels(config_labels, rotation=45, ha="right")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ filename = "silu_benchmark_combined_3way.png"
+ plt.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.show()
+
+ return filename
+
+
+outer_dim = 7168
configs = [
- (8, 32, 1024),
- (16, 64, 2048),
- (32, 128, 4096),
# DeepSeekV3 Configs
- (256, 16, 7168),
- (256, 32, 7168),
- (256, 64, 7168),
- (256, 128, 7168),
- (256, 256, 7168),
- (256, 512, 7168),
+ # (1, 56, 7168),
+ (8, 1024, 7168),
+ # (32, 56, 7168),
+    (32, 1024, 7168),
(256, 1024, 7168),
]
+runs = 100
+num_warmups = 20
+
+strategy_descriptions = {
+ "uniform": "Uniform Random",
+ "random_imbalanced": "Imbalanced Random",
+ "max_t": "Even Assignment",
+ "first_t": "experts[0] = T, experts[1:] = 0",
+}
+
print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
-print("-" * 50)
-
-for E, T, H in configs:
- try:
- time_ms, gflops, gbps = benchmark(E, T, H)
- print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
- except Exception:
- print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
+print(f"Testing strategies: {', '.join(strategies)}")
+print(f"Configurations: {len(configs)} configs")
+
+all_results = []
+
+# Run benchmarks for each strategy
+for id, strategy in enumerate(strategies):
+ print(f"\n{'=' * 60}")
+ print(f"Testing strategy: {strategy_descriptions[strategy]}")
+ print(f"{'=' * 60}")
+
+ # Collect benchmark data for all three algorithms
+ config_labels = []
+ config_x_axis = []
+ all_silu_v2_results = []
+ all_triton_results = []
+ all_ratios = []
+
+ for E, T, H in configs:
+ total_tokens_config = []
+ for i in [8, 16, 32, 64, 128, 256, 512]:
+ if i <= T:
+ total_tokens_config.append(i * E)
+ config_x_axis.append(total_tokens_config)
+
+ silu_v2_results = []
+ triton_results = []
+ ratios = []
+
+ for total_tokens in total_tokens_config:
+ config_label = f"E={E},T={T},H={H},TT={total_tokens}"
+ config_labels.append(config_label)
+
+ # SiLU V2 (CUDA kernel) results
+ time_ms_silu_v2, gflops, gbps, perc = benchmark(
+ persistent_masked_m_silu_mul_quant,
+ E,
+ T,
+ H,
+ total_tokens,
+ runs=runs,
+ num_warmups=num_warmups,
+ gen_strategy=strategy,
+ )
+ silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
+
+ # Triton kernel results
+ time_ms_triton, gflops, gbps, perc = benchmark(
+ silu_mul_fp8_quant_deep_gemm_triton,
+ E,
+ T,
+ H,
+ total_tokens,
+ runs=runs,
+ num_warmups=num_warmups,
+ gen_strategy=strategy,
+ )
+ triton_results.append((time_ms_triton, gflops, gbps, perc))
+
+ # Calculate speedup ratios (triton baseline / implementation)
+ triton_v2_ratio = time_ms_triton / time_ms_silu_v2
+ ratios.append(triton_v2_ratio)
+
+ print(
+ f"Completed: {config_label}:"
+ f" V2: {time_ms_silu_v2:.3f}ms,"
+ f" Triton: {time_ms_triton:.3f}ms"
+ )
+
+ all_silu_v2_results.append(silu_v2_results)
+ all_triton_results.append(triton_results)
+ all_ratios.append(ratios)
+
+ # Store results for combined plotting
+ all_results.append(
+ (
+ strategy_descriptions[strategy],
+ all_ratios,
+ all_silu_v2_results,
+ all_triton_results,
+ config_labels,
+ config_x_axis,
+ )
+ )
+
+ # Print summary table for this strategy
+ print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
+ print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
+ print("-" * 90)
+
+ for i, (E, T, H) in enumerate(configs):
+ # Get the first result for each config (simplifying for summary)
+        v2_time = all_silu_v2_results[i][0][0]
+        triton_time = all_triton_results[i][0][0]
+ triton_v2_speedup = triton_time / v2_time
+ config_label = f"E={E:3d},T={T:4d},H={H:4d}"
+ print(
+ f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
+ f"{triton_v2_speedup:8.2f}x"
+ )
+
+
+def create_total_tokens_plot(all_results):
+ num_strategies = len(all_results)
+ num_configs = len(configs)
+
+ fig, axs = plt.subplots(
+ num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
+ )
+
+ # Add main title to the entire figure
+ fig.suptitle(
+ "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
+ fontsize=18,
+ fontweight="bold",
+ y=0.98,
+ )
+
+ # Handle single strategy case
+ if num_strategies == 1:
+ axs = axs.reshape(1, -1)
+
+ # Handle single config case
+ if num_configs == 1:
+ axs = axs.reshape(-1, 2)
+
+ for strategy_idx, result in enumerate(all_results):
+ (
+ strategy_name,
+ all_ratios,
+ all_silu_v2_results,
+ all_triton_results,
+ config_labels,
+ config_x_axis,
+ ) = result
+
+ for config_idx in range(num_configs):
+ # Speedup plot (left column)
+ ax_speedup = axs[strategy_idx, config_idx * 2]
+ # Bandwidth plot (right column)
+ ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]
+
+ E, T, H = configs[config_idx]
+ ratios = all_ratios[config_idx]
+ total_tokens_values = config_x_axis[config_idx]
+
+ # Extract speedup ratios
+ triton_v2_ratios = [ratio for ratio in ratios]
+
+ # Extract bandwidth percentages for all implementations
+ v2_bandwidth_percentages = [
+ result[3] for result in all_silu_v2_results[config_idx]
+ ]
+ triton_bandwidth_percentages = [
+ result[3] for result in all_triton_results[config_idx]
+ ]
+
+ # Plot speedup ratios vs total tokens (left plot)
+ ax_speedup.plot(
+ total_tokens_values,
+ triton_v2_ratios,
+ "go-",
+ linewidth=3,
+ markersize=8,
+ label="Triton/V2 Speedup",
+ )
+ ax_speedup.set_title(
+ f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
+ fontsize=12,
+ fontweight="bold",
+ )
+ ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+ ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+ ax_speedup.legend(prop={"weight": "bold"})
+ ax_speedup.grid(True, alpha=0.3)
+
+ # Plot bandwidth utilization (right plot)
+ ax_bandwidth.plot(
+ total_tokens_values,
+ v2_bandwidth_percentages,
+ "o-",
+ linewidth=3,
+ markersize=8,
+ label="SiLU V2",
+ color="blue",
+ )
+ ax_bandwidth.plot(
+ total_tokens_values,
+ triton_bandwidth_percentages,
+ "o-",
+ linewidth=3,
+ markersize=8,
+ label="Triton",
+ color="green",
+ )
+ ax_bandwidth.set_title(
+ f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
+ fontsize=12,
+ fontweight="bold",
+ )
+ ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+ ax_bandwidth.set_ylabel(
+ "% of Peak Bandwidth", fontweight="bold", fontsize=11
+ )
+ ax_bandwidth.legend(prop={"weight": "bold"})
+ ax_bandwidth.grid(True, alpha=0.3)
+
+ # Format x-axis labels for both plots
+ for ax in [ax_speedup, ax_bandwidth]:
+ ax.set_xticks(total_tokens_values)
+ ax.set_xticklabels(
+ [
+ f"{tt // 1000}K" if tt >= 1000 else str(tt)
+ for tt in total_tokens_values
+ ],
+ fontweight="bold",
+ )
+ # Make tick labels bold
+ for label in ax.get_xticklabels() + ax.get_yticklabels():
+ label.set_fontweight("bold")
+
+ # Add value labels on Triton/V2 speedup points
+ for x, y in zip(total_tokens_values, triton_v2_ratios):
+ ax_speedup.annotate(
+ f"{y:.2f}x",
+ (x, y),
+ textcoords="offset points",
+ xytext=(0, -15),
+ ha="center",
+ fontsize=9,
+ fontweight="bold",
+ bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
+ )
+
+ plt.tight_layout()
+ plt.subplots_adjust(top=0.93) # Make room for main title
+ filename = "silu_benchmark_total_tokens_3way.png"
+ plt.savefig(filename, dpi=300, bbox_inches="tight")
+ plt.show()
+
+ return filename
+
+
+# Create comprehensive 3-way comparison plots
+combined_plot_filename = create_combined_plot(all_results)
+total_tokens_plot_filename = create_total_tokens_plot(all_results)
+
+print(f"\n{'=' * 80}")
+print("3-Way Benchmark Suite Complete!")
+print(f"Generated combined comparison plot: {combined_plot_filename}")
+print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
+print("Compared: SiLU V2 (CUDA), and Triton implementations")
+print(f"{'=' * 80}")
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 6ddab4621457..f7cdc25794ca 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -4,7 +4,6 @@
import csv
import os
from datetime import datetime
-from typing import Optional
import flashinfer
import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_decode(
dtype: torch.dtype,
- quant_dtypes: tuple[
- Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
- ],
+ quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_len: int,
num_heads: tuple[int, int] = (64, 8),
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 131df74c7de1..7993354475fc 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -4,7 +4,6 @@
import csv
import os
from datetime import datetime
-from typing import Optional
import flashinfer
import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_prefill(
dtype: torch.dtype,
- quant_dtypes: tuple[
- Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
- ],
+ quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
batch_size: int,
max_seq_len: int,
num_heads: tuple[int, int] = (64, 8),
diff --git a/benchmarks/kernels/benchmark_vision_rotary_emb.py b/benchmarks/kernels/benchmark_vision_rotary_emb.py
new file mode 100644
index 000000000000..0b4e7ddb0d4b
--- /dev/null
+++ b/benchmarks/kernels/benchmark_vision_rotary_emb.py
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
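+# Example usage (flags mirror the argparse defaults defined below):
+#   python benchmarks/kernels/benchmark_vision_rotary_emb.py \
+#       --batch-size 1 --seq-len 8192 --num-heads 16 --head-size 80 --dtype bfloat16
+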
+import statistics
+import time
+
+import torch
+
+from vllm.model_executor.models.qwen2_vl import (
+ Qwen2VisionRotaryEmbedding,
+ apply_rotary_pos_emb_vision,
+ apply_rotary_pos_emb_vision_2c,
+)
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+
+
+def benchmark_vision_rotary(
+ batch_size: int,
+ seq_len: int,
+ num_heads: int,
+ head_size: int,
+ dtype: torch.dtype,
+ seed: int,
+ device: str,
+ warmup_iter: int = 10,
+ benchmark_iter: int = 100,
+) -> None:
+ current_platform.seed_everything(seed)
+ torch.set_default_device(device)
+
+ # Qwen2-VL uses rotary over half the head dim
+ rotary_dim = head_size // 2
+ rope = Qwen2VisionRotaryEmbedding(rotary_dim)
+ rope = rope.to(dtype=torch.float32, device=torch.get_default_device())
+ freqs = rope(seq_len)
+
+ q = torch.randn(batch_size, seq_len, num_heads, head_size, dtype=dtype)
+ k = torch.randn_like(q)
+
+ # warmup
+ for _ in range(warmup_iter):
+ apply_rotary_pos_emb_vision(q, freqs)
+ apply_rotary_pos_emb_vision(k, freqs)
+ apply_rotary_pos_emb_vision_2c(q, k, freqs)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+ def time_op_cuda_events(fn) -> float:
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
+ start_event.record()
+ fn()
+ end_event.record()
+ end_event.synchronize()
+ return start_event.elapsed_time(end_event) # ms
+
+    def time_op_cpu_timer(fn) -> float:
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        start = time.perf_counter()
+        fn()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        return (time.perf_counter() - start) * 1000.0  # ms
+
+ timer = time_op_cuda_events if torch.cuda.is_available() else time_op_cpu_timer
+
+ # 1c path timing: apply to q and k separately
+ lat_1c: list[float] = []
+ for _ in range(benchmark_iter):
+ lat_1c.append(
+ timer(
+ lambda: (
+ apply_rotary_pos_emb_vision(q, freqs),
+ apply_rotary_pos_emb_vision(k, freqs),
+ )
+ )
+ )
+
+ # 2c path timing: apply to q and k together
+ lat_2c: list[float] = []
+ for _ in range(benchmark_iter):
+ lat_2c.append(timer(lambda: apply_rotary_pos_emb_vision_2c(q, k, freqs)))
+
+ mean_1c = statistics.mean(lat_1c)
+ mean_2c = statistics.mean(lat_2c)
+ med_1c = statistics.median(lat_1c)
+ med_2c = statistics.median(lat_2c)
+
+ print("== Vision Rotary Benchmark (1c vs 2c) ==")
+ print(
+ f"Config: batch={batch_size}, seqlen={seq_len}, "
+ f"heads={num_heads}, head_dim={head_size}, dtype={dtype}"
+ )
+ print(f"Iters: warmup={warmup_iter}, bench={benchmark_iter}")
+ print(f"1c (separated q and k): mean={mean_1c:.4f} ms, median={med_1c:.4f} ms")
+ print(f"2c (fused q and k): mean={mean_2c:.4f} ms, median={med_2c:.4f} ms")
+ print(f"Fusion speedup: {mean_1c / mean_2c:.3f}x")
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+ description="Benchmark the 1c vs 2c vision rotary embedding paths."
+ )
+ parser.add_argument("--batch-size", type=int, default=1)
+ parser.add_argument("--seq-len", type=int, default=8192)
+ parser.add_argument("--num-heads", type=int, default=16)
+ parser.add_argument(
+ "--head-size",
+ type=int,
+ default=80,
+ )
+ parser.add_argument(
+ "--dtype",
+ type=str,
+ choices=["bfloat16", "float", "float16"],
+ default="bfloat16",
+ )
+ parser.add_argument("--device", type=str, default="cuda")
+ parser.add_argument("--seed", type=int, default=0)
+ parser.add_argument("--warmup-iter", type=int, default=10)
+ parser.add_argument("--benchmark-iter", type=int, default=1000)
+ args = parser.parse_args()
+
+ benchmark_vision_rotary(
+ batch_size=args.batch_size,
+ seq_len=args.seq_len,
+ num_heads=args.num_heads,
+ head_size=args.head_size,
+ dtype=getattr(torch, args.dtype),
+ seed=args.seed,
+ device=args.device,
+ warmup_iter=args.warmup_iter,
+ benchmark_iter=args.benchmark_iter,
+ )
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 98bde9d83c82..602fad181074 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -11,13 +11,13 @@
from typing import Any
import torch
-import triton
from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- _w8a8_block_fp8_matmul,
+ _w8a8_triton_block_scaled_mm,
)
from vllm.platforms import current_platform
+from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
- output_dytpe: The dtype of the returned tensor.
+ output_dtype: The dtype of the returned tensor.
Returns:
torch.Tensor: The result of matmul.
@@ -83,7 +83,7 @@ def grid(META):
)
if A.dtype == torch.float8_e4m3fn:
- kernel = _w8a8_block_fp8_matmul
+ kernel = _w8a8_triton_block_scaled_mm
else:
raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")
diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
index b99c2099f2c3..ba31bc563829 100644
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# fmt: off
# ruff: noqa: E501
import time
@@ -8,27 +7,33 @@
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
- get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8,
- w8a8_block_fp8_matmul,
+ w8a8_triton_block_scaled_mm,
)
from vllm.triton_utils import triton
-from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (
+ calc_diff,
+ fp8_gemm_nt,
+ get_col_major_tma_aligned_tensor,
+ per_block_cast_to_fp8,
+)
-def benchmark_shape(m: int,
- n: int,
- k: int,
- warmup: int = 100,
- repeat: int = 10000,
- verbose: bool = False) -> dict:
+def benchmark_shape(
+ m: int,
+ n: int,
+ k: int,
+ warmup: int = 100,
+ repeat: int = 10000,
+ verbose: bool = False,
+) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape."""
if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors
- A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
- B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
+ A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+ B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
# Reference result in BF16
torch.cuda.synchronize()
@@ -45,34 +50,39 @@ def benchmark_shape(m: int,
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
- C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
+ C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
- A, block_size[1], column_major_scales=True)
+ A, block_size[1], column_major_scales=True
+ )
# === DeepGEMM Implementation ===
def deepgemm_gemm():
- fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
- (B_deepgemm, B_scale_deepgemm),
- C_deepgemm)
+ fp8_gemm_nt(
+ (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
+ )
return C_deepgemm
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
- return w8a8_block_fp8_matmul(A_vllm,
- B_vllm,
- A_scale_vllm,
- B_scale_vllm,
- block_size,
- output_dtype=torch.bfloat16)
+ return w8a8_triton_block_scaled_mm(
+ A_vllm,
+ B_vllm,
+ A_scale_vllm,
+ B_scale_vllm,
+ block_size,
+ output_dtype=torch.bfloat16,
+ )
# === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm():
- return ops.cutlass_scaled_mm(A_vllm_cutlass,
- B_vllm.T,
- scale_a=A_scale_vllm_cutlass,
- scale_b=B_scale_vllm.T,
- out_dtype=torch.bfloat16)
+ return ops.cutlass_scaled_mm(
+ A_vllm_cutlass,
+ B_vllm.T,
+ scale_a=A_scale_vllm_cutlass,
+ scale_b=B_scale_vllm.T,
+ out_dtype=torch.bfloat16,
+ )
# Run correctness check first
if verbose:
@@ -89,26 +99,23 @@ def vllm_cutlass_gemm():
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
- print("vLLM Triton vs DeepGEMM difference: "
- f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
- print("vLLM CUTLASS vs DeepGEMM difference: "
- f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
+ print(
+ "vLLM Triton vs DeepGEMM difference: "
+ f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
+ )
+ print(
+ "vLLM CUTLASS vs DeepGEMM difference: "
+ f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
+ )
# Benchmark implementations
implementations = {
"DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm,
- "vLLM CUTLASS": vllm_cutlass_gemm
+ "vLLM CUTLASS": vllm_cutlass_gemm,
}
- benchmark_results = {
- "shape": {
- "m": m,
- "n": n,
- "k": k
- },
- "implementations": {}
- }
+ benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
for name, func in implementations.items():
# Warmup
@@ -136,38 +143,36 @@ def vllm_cutlass_gemm():
"tflops": tflops,
"gb_s": gb_s,
"diff": {
- "DeepGEMM":
- 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
- "Reference":
- deepgemm_diff if name == "DeepGEMM" else
- (vllm_triton_diff
- if name == "vLLM Triton" else vllm_cutlass_diff)
- }
+ "DeepGEMM": 0.0
+ if name == "DeepGEMM"
+ else calc_diff(func(), C_deepgemm),
+ "Reference": deepgemm_diff
+ if name == "DeepGEMM"
+ else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
+ },
}
if verbose:
- print(
- f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
- )
+ print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
# Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM":
speedup = baseline / data["time_ms"]
- benchmark_results["implementations"][name][
- "speedup_vs_deepgemm"] = speedup
+ benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
if verbose:
- print(f"DeepGEMM is {1/speedup:.2f}x "
- f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
+ print(
+ f"DeepGEMM is {1 / speedup:.2f}x "
+ f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
+ )
- vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
- "time_ms"]
- vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
- "time_ms"]
+ vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
+ vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
- benchmark_results["implementations"]["vLLM CUTLASS"][
- "speedup_vs_triton"] = cutlass_vs_triton
+ benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
+ cutlass_vs_triton
+ )
if verbose:
print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
@@ -179,8 +184,7 @@ def vllm_cutlass_gemm():
def format_table_row(values, widths):
"""Format a row with specified column widths."""
- return "| " + " | ".join(f"{val:{w}}"
- for val, w in zip(values, widths)) + " |"
+ return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
def print_table(headers, rows, title=None):
@@ -288,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"]
- deepgemm_rows.append([
- shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
- f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
- ])
+ deepgemm_rows.append(
+ [
+ shape["m"],
+ shape["n"],
+ shape["k"],
+ f"{impl_data['time_us']:.1f}",
+ f"{impl_data['tflops']:.1f}",
+ f"{impl_data['gb_s']:.1f}",
+ ]
+ )
- print_table(deepgemm_headers,
- deepgemm_rows,
- title="DeepGEMM Implementation:")
+ print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
# Print vLLM Triton table
- triton_headers = [
- "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
- ]
+ triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
triton_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
- triton_rows.append([
- shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
- f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
- format_speedup(speedup)
- ])
+ triton_rows.append(
+ [
+ shape["m"],
+ shape["n"],
+ shape["k"],
+ f"{impl_data['time_us']:.1f}",
+ f"{impl_data['tflops']:.1f}",
+ f"{impl_data['gb_s']:.1f}",
+ format_speedup(speedup),
+ ]
+ )
- print_table(triton_headers,
- triton_rows,
- title="vLLM Triton Implementation:")
+ print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table
cutlass_headers = [
- "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
- "vs Triton"
+ "m",
+ "n",
+ "k",
+ "Time (μs)",
+ "TFLOPS",
+ "GB/s",
+ "vs DeepGEMM",
+ "vs Triton",
]
cutlass_rows = []
for result in all_results:
@@ -327,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
- cutlass_rows.append([
- shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
- f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
- format_speedup(vs_deepgemm),
- format_speedup(vs_triton)
- ])
+ cutlass_rows.append(
+ [
+ shape["m"],
+ shape["n"],
+ shape["k"],
+ f"{impl_data['time_us']:.1f}",
+ f"{impl_data['tflops']:.1f}",
+ f"{impl_data['gb_s']:.1f}",
+ format_speedup(vs_deepgemm),
+ format_speedup(vs_triton),
+ ]
+ )
- print_table(cutlass_headers,
- cutlass_rows,
- title="vLLM CUTLASS Implementation:")
+ print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
# Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = {
- impl: {
- "tflops": 0,
- "gb_s": 0,
- "time_ms": 0
- }
- for impl in implementations
+ impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
}
for result in all_results:
@@ -366,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
- avg_rows.append([
- impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
- ])
+ avg_rows.append(
+ [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
+ )
print_table(avg_headers, avg_rows)
@@ -376,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
avg_speedups = {
"DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0,
- "vLLM CUTLASS vs vLLM Triton": 0
+ "vLLM CUTLASS vs vLLM Triton": 0,
}
for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
- vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
- "time_ms"]
+ vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
- avg_speedups[
- "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
- avg_speedups[
- "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
- avg_speedups[
- "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
+ avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
+ avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
+ avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
+ vllm_triton_time / vllm_cutlass_time
+ )
print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"]
@@ -408,8 +421,7 @@ def run_benchmarks(verbose: bool = False):
for result in all_results:
for impl in implementations:
- avg_diff[impl] += result["implementations"][impl]["diff"][
- "Reference"]
+ avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = []
diff --git a/benchmarks/kernels/moe_tune_script.sh b/benchmarks/kernels/moe_tune_script.sh
new file mode 100755
index 000000000000..acd2502e0587
--- /dev/null
+++ b/benchmarks/kernels/moe_tune_script.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+
+## ---- Mixtral fp8 tuning example ---- ##
+python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 1 --tune --dtype fp8_w8a8
+python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 2 --tune --dtype fp8_w8a8
+python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 4 --tune --dtype fp8_w8a8
+python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 8 --tune --dtype fp8_w8a8
+
+
+## ---- Mixtral fp16 tuning example ---- ##
+# We don't need --dtype fp16; it is already set as the default for ROCm in the script.
+
+python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 1 --tune
+python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 2 --tune
+python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 4 --tune
+python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 8 --tune
+
+
+
+## ---- After the tuning is finished ---- ##
+# The tuning script saves the configurations in a JSON file in the same directory from which you launch the script.
+# The name of the JSON file will look something like this: E=8,N=14336,device_name=AMD_Instinct_MI300X.json
+#
+# [IMPORTANT] -> Once the tuning is complete, move the tuned config file(s) to the following path:
+# vllm/vllm/model_executor/layers/fused_moe/configs/
+
+
+## ---- Notes ---- ##
+# 1. The tuned file is specific to a TP size. This means a tuned file obtained with --tp-size 8 can only be used when running the model under the TP=8 setting.
+# 2. The script uses Ray for multi-GPU tuning. Export HIP_VISIBLE_DEVICES accordingly to expose the required number of GPUs for tuning.
+# 3. RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 resolves the following errors (depending on whether HIP_VISIBLE_DEVICES is set):
+# - Error-1: RuntimeError: HIP error: invalid device ordinal
+# HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+# For debugging consider passing AMD_SERIALIZE_KERNEL=3
+# - Error-2: RuntimeError: HIP_VISIBLE_DEVICES contains more devices than ROCR_VISIBLE_DEVICES
+
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
index 4bbb36bb4359..a9af811bbe9c 100644
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
-from collections.abc import Iterable
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Iterable
+from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
@@ -55,7 +55,7 @@ def n_args(self):
def __init__(
self,
- cuda_graph_params: Optional[CudaGraphBenchParams],
+ cuda_graph_params: CudaGraphBenchParams | None,
label: str,
sub_label: str,
description: str,
diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md
index 7adf97bcf562..f5b5c6c97d48 100644
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -55,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
+### JSON configuration file for synthetic conversation generation
+
+The `--input-file` flag determines the input conversations for the benchmark.
+When the input is a JSON file with the field `"filetype": "generate_conversations"`, the tool will generate synthetic multi-turn (question and answer) conversations.
+
+The file `generate_multi_turn.json` is an example of such a configuration.
+
+The file must contain the sections `prompt_input` and `prompt_output` (see the skeleton after the field descriptions below).
+
+The `prompt_input` section must contain `num_turns`, `prefix_num_tokens`, and `num_tokens`:
+
+* `num_turns` - Total number of turns in the conversation (both user & assistant).
+The final value will always be rounded to an even number so that each user turn has a reply.
+* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
+* `num_tokens` - Total token length of each **user** message (one turn).
+
+The `prompt_output` section must contain `num_tokens`:
+
+* `num_tokens` - Total token length of each **assistant** message (one turn).
+
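+A minimal skeleton of such a file is shown below. Only the required fields are included, and the distribution choices and values are placeholders; see `generate_multi_turn.json` for a complete example.
+
+```json
+{
+    "filetype": "generate_conversations",
+    "prompt_input": {
+        "num_turns": {"distribution": "uniform", "min": 12, "max": 18},
+        "prefix_num_tokens": {"distribution": "lognormal", "average": 1000, "max": 5000},
+        "num_tokens": {"distribution": "uniform", "min": 120, "max": 160}
+    },
+    "prompt_output": {
+        "num_tokens": {"distribution": "lognormal", "average": 1000, "max": 5000}
+    }
+}
+```
+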
+### Random distributions for synthetic conversation generation
+
+When creating an input JSON file (such as `generate_multi_turn.json`),
+every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.
+The distribution determines how to randomly sample values for the field.
+
+The available distributions are listed below.
+
+**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.
+This can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
+
+#### constant
+
+```json
+{
+ "distribution": "constant",
+ "value": 500
+}
+```
+
+* `value` - the fixed integer value (always returns the same number).
+
+#### uniform
+
+```json
+{
+ "distribution": "uniform",
+ "min": 12,
+ "max": 18
+}
+```
+
+* `min` - minimum value (inclusive).
+* `max` - maximum value (inclusive), must be equal to or larger than `min`.
+
+#### lognormal
+
+```json
+{
+ "distribution": "lognormal",
+ "average": 1000,
+ "max": 5000
+}
+```
+
+You can parameterize the lognormal distribution in one of two ways (a short sketch of the conversion between the two forms follows this list):
+
+Using the average and optional median ratio:
+
+* `average` - target average value of the distribution.
+* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
+
+Using the parameters of the underlying normal distribution:
+
+* `mean` - mean of the underlying normal distribution.
+* `sigma` - standard deviation of the underlying normal distribution.
+
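+For reference, here is a minimal sketch of how the `average`/`median_ratio` form maps onto the `mean`/`sigma` of the underlying normal distribution (the helper name is illustrative and not part of the benchmark code):
+
+```python
+import numpy as np
+
+
+def lognormal_params_from_average(average: float, median_ratio: float = 0.85):
+    # Mirrors the conversion used by LognormalDistribution in bench_dataset.py:
+    # median / mean = exp(-sigma^2 / 2)  =>  sigma^2 = 2 * ln(mean / median)
+    # median = exp(mu)                   =>  mu = ln(median)
+    median = average * median_ratio
+    sigma = float(np.sqrt(2 * np.log(average / median)))
+    mu = float(np.log(median))
+    return mu, sigma
+
+
+mu, sigma = lognormal_params_from_average(average=1000)
+samples = np.random.lognormal(mean=mu, sigma=sigma, size=100_000)
+print(round(samples.mean()))  # close to 1000 (the benchmark also rescales samples to match the average exactly)
+```
+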
+#### zipf
+
+```json
+{
+ "distribution": "zipf",
+ "alpha": 1.2,
+ "max": 100
+}
+```
+
+* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
+
+#### poisson
+
+```json
+{
+ "distribution": "poisson",
+ "alpha": 10,
+ "max": 50
+}
+```
+
+* `alpha` - expected value (λ). Also the variance of the distribution.
+
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py
index 411b89dd23dc..2674899d1cc5 100644
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
import numpy as np # type: ignore
import pandas as pd # type: ignore
@@ -35,8 +35,8 @@ def sample(self, size: int = 1) -> np.ndarray:
class UniformDistribution(Distribution):
def __init__(
self,
- min_val: Union[int, float],
- max_val: Union[int, float],
+ min_val: int | float,
+ max_val: int | float,
is_integer: bool = True,
) -> None:
self.min_val = min_val
@@ -56,7 +56,7 @@ def __repr__(self) -> str:
class ConstantDistribution(Distribution):
- def __init__(self, value: Union[int, float]) -> None:
+ def __init__(self, value: int | float) -> None:
self.value = value
self.max_val = value
@@ -68,7 +68,7 @@ def __repr__(self) -> str:
class ZipfDistribution(Distribution):
- def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+ def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
@@ -83,7 +83,7 @@ def __repr__(self) -> str:
class PoissonDistribution(Distribution):
- def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+ def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
@@ -99,21 +99,105 @@ def __repr__(self) -> str:
class LognormalDistribution(Distribution):
def __init__(
- self, mean: float, sigma: float, max_val: Optional[int] = None
+ self,
+ mean: float | None = None,
+ sigma: float | None = None,
+ average: int | None = None,
+ median_ratio: float | None = None,
+ max_val: int | None = None,
) -> None:
+ self.average = average
+ self.median_ratio = median_ratio
+ self.max_val = max_val
+
+ if average is not None:
+ if average < 1:
+ raise ValueError("Lognormal average must be positive")
+
+ if mean or sigma:
+ raise ValueError(
+ "When using lognormal average, you can't provide mean/sigma"
+ )
+
+ if self.median_ratio is None:
+ # Default value that provides relatively wide range of values
+ self.median_ratio = 0.85
+
+ # Calculate mean/sigma of np.random.lognormal based on the average
+ mean, sigma = self._generate_lognormal_by_median(
+ target_average=self.average, median_ratio=self.median_ratio
+ )
+ else:
+ if mean is None or sigma is None:
+ raise ValueError(
+ "Must provide both mean and sigma if average is not used"
+ )
+
+ if mean <= 0 or sigma < 0:
+ raise ValueError(
+ "Lognormal mean must be positive and sigma must be non-negative"
+ )
+
+ # Mean and standard deviation of the underlying normal distribution
+ # Based on numpy.random.lognormal
self.mean = mean
self.sigma = sigma
- self.max_val = max_val
+
+ @staticmethod
+ def _generate_lognormal_by_median(
+ target_average: int, median_ratio: float
+ ) -> tuple[float, float]:
+ """
+ Compute (mu, sigma) for a lognormal distribution given:
+ - a target average (mean of the distribution)
+ - a ratio of median / mean (controls skewness), assume mean > median
+
+ Background:
+ If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
+ * mean(X) = exp(mu + sigma^2 / 2)
+ * median(X) = exp(mu)
+
+ So:
+ median / mean = exp(mu) / exp(mu + sigma^2 / 2)
+ = exp(-sigma^2 / 2)
+
+ Rearranging:
+ sigma^2 = 2 * ln(mean / median)
+ mu = ln(median)
+
+ This gives a unique (mu, sigma) for any valid mean and median.
+ """
+ # Check input validity: median must be smaller than mean
+ if median_ratio <= 0 or median_ratio >= 1:
+ raise ValueError("median_ratio must be in range (0, 1)")
+
+ target_median = target_average * median_ratio
+
+ # Solve sigma^2 = 2 * ln(mean / median)
+ sigma = np.sqrt(2 * np.log(target_average / target_median))
+ mu = np.log(target_median)
+
+ return mu, sigma
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
+
+ if self.average is not None:
+ # Scale to average
+ samples *= self.average / samples.mean()
+
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
- return f"LognormalDistribution[{self.mean}, {self.sigma}]"
+ if self.average:
+ return (
+ f"LognormalDistribution[{self.average}, "
+ f"{self.median_ratio}, {self.max_val}]"
+ )
+ return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
class GenConvArgs(NamedTuple):
@@ -173,10 +257,21 @@ def get_random_distribution(
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
+ max_val = conf.get("max", None)
+
+ if "average" in conf:
+ # Infer lognormal mean/sigma (numpy) from input average
+ median_ratio = conf.get("median_ratio", None)
+ return LognormalDistribution(
+ average=conf["average"], median_ratio=median_ratio, max_val=max_val
+ )
+
+ # Use mean/sigma directly (for full control over the distribution)
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
- max_val = conf.get("max", None)
- return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
+ return LognormalDistribution(
+ mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
+ )
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 66d85eaf5131..67a085b40ed3 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -13,7 +13,7 @@
from enum import Enum
from http import HTTPStatus
from statistics import mean
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple
import aiohttp # type: ignore
import numpy as np # type: ignore
@@ -46,9 +46,9 @@ def __str__(self):
class ClientArgs(NamedTuple):
seed: int
- max_num_requests: Optional[int]
+ max_num_requests: int | None
skip_first_turn: bool
- max_turns: Optional[int]
+ max_turns: int | None
max_active_conversations: int
verbose: bool
print_content: bool
@@ -109,9 +109,9 @@ def __str__(self) -> str:
class MetricStats:
def __init__(self) -> None:
- self.min: Optional[float] = None
- self.max: Optional[float] = None
- self.avg: Optional[float] = None
+ self.min: float | None = None
+ self.max: float | None = None
+ self.avg: float | None = None
self.sum = 0.0
self.count = 0
@@ -143,7 +143,7 @@ def __init__(self, window_size: int) -> None:
self.index = 0
self.sum = 0.0
self.count = 0
- self.avg: Optional[float] = None
+ self.avg: float | None = None
def update(self, new_value: float) -> None:
if self.count < self.window_size:
@@ -169,7 +169,7 @@ def __repr__(self) -> str:
class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger
- self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+ self.metrics: dict[str, MovingAverage | MetricStats] = {
"moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(),
@@ -198,14 +198,6 @@ def print(self) -> None:
self.logger.info("-" * 50)
-# Must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
- if text.startswith(prefix):
- return text[len(prefix) :]
- return text
-
-
def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0
@@ -220,8 +212,8 @@ async def send_request(
chat_url: str,
model: str,
stream: bool = True,
- min_tokens: Optional[int] = None,
- max_tokens: Optional[int] = None,
+ min_tokens: int | None = None,
+ max_tokens: int | None = None,
) -> ServerResponse:
payload = {
"model": model,
@@ -250,9 +242,9 @@ async def send_request(
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
- ttft: Optional[float] = None
+ ttft: float | None = None
chunk_delay: list[int] = []
- latency: Optional[float] = None
+ latency: float | None = None
first_chunk = ""
generated_text = ""
@@ -269,7 +261,7 @@ async def send_request(
if not chunk_bytes:
continue
- chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk == "[DONE]":
# End of stream
latency = time.perf_counter_ns() - start_time
@@ -364,7 +356,7 @@ async def send_turn(
req_args: RequestArgs,
verbose: bool,
verify_output: bool,
-) -> Optional[RequestStats]:
+) -> RequestStats | None:
assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages)
@@ -644,7 +636,7 @@ async def client_main(
if args.verbose:
curr_time_sec: float = time.perf_counter()
- time_since_last_turn: Union[str, float] = "N/A"
+ time_since_last_turn: str | float = "N/A"
if conv_id in time_of_last_turn:
time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3
@@ -769,7 +761,7 @@ def get_client_config(
"Number of conversations must be equal or larger than the number of clients"
)
- max_req_per_client: Optional[int] = None
+ max_req_per_client: int | None = None
if args.max_num_requests is not None:
# Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients
@@ -936,13 +928,13 @@ async def main_mp(
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
)
- rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+ rps: str | float = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed)
rps = "N/A"
- runtime_left_sec: Union[str, float] = round(
+ runtime_left_sec: str | float = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
)
if percent < 0.05:
@@ -1032,7 +1024,7 @@ def process_statistics(
warmup_percentages: list[float],
test_params: dict,
verbose: bool,
- gen_conv_args: Optional[GenConvArgs] = None,
+ gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
) -> None:
if len(client_metrics) == 0:
@@ -1259,7 +1251,7 @@ async def main() -> None:
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
- "same as the ``--model`` argument. ",
+ "same as the `--model` argument. ",
)
parser.add_argument(
diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py
index c3622c99a2e5..fccab4d0ce21 100644
--- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py
+++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py
@@ -13,7 +13,7 @@
import json
import random
from statistics import mean
-from typing import Any, Optional
+from typing import Any
import pandas as pd # type: ignore
import tqdm # type: ignore
@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
def content_is_valid(
- content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+ content: str, min_content_len: int | None, max_content_len: int | None
) -> bool:
if min_content_len and len(content) < min_content_len:
return False
@@ -37,7 +37,7 @@ def content_is_valid(
def print_stats(
- conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+ conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
) -> None:
# Collect statistics
stats = []
@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
seed: int,
input_file: str,
output_file: str,
- max_items: Optional[int],
- min_content_len: Optional[int] = None,
- max_content_len: Optional[int] = None,
- min_turns: Optional[int] = None,
- max_turns: Optional[int] = None,
- model: Optional[str] = None,
+ max_items: int | None,
+ min_content_len: int | None = None,
+ max_content_len: int | None = None,
+ min_turns: int | None = None,
+ max_turns: int | None = None,
+ model: str | None = None,
) -> None:
if min_turns and max_turns:
assert min_turns <= max_turns
diff --git a/benchmarks/multi_turn/generate_multi_turn.json b/benchmarks/multi_turn/generate_multi_turn.json
index 274d03c2bdb2..03cfc7d63e8a 100644
--- a/benchmarks/multi_turn/generate_multi_turn.json
+++ b/benchmarks/multi_turn/generate_multi_turn.json
@@ -15,9 +15,8 @@
},
"prefix_num_tokens": {
"distribution": "lognormal",
- "mean": 6,
- "sigma": 4,
- "max": 1500
+ "average": 1000,
+ "max": 5000
},
"num_tokens": {
"distribution": "uniform",
diff --git a/benchmarks/profiling/README.md b/benchmarks/profiling/README.md
new file mode 100644
index 000000000000..ee65e8025cc5
--- /dev/null
+++ b/benchmarks/profiling/README.md
@@ -0,0 +1,57 @@
+# vLLM Benchmark Profiling
+
+This profiling directory provides a method to profile vLLM throughput and latency benchmarks using ROCm profiling utilities.
+
+## 1. Dependencies
+
+Before using the profiling feature, you need to install the required dependencies:
+
+### Install ROCm Profile Data
+
+```bash
+git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git
+cd rocmProfileData && make && sudo make install
+```
+
+### Install hipMarker
+
+```bash
+cd rocmProfileData/hipMarker && python3 setup.py install
+```
+
+## 2. Profiling Benchmarks
+
+Profiling can be used to monitor the performance of the vLLM benchmarks with ROCm. The key flags used for profiling are:
+
+- `--profile-rpd`: Profiles the generation process of a single batch.
+- `--profile-dir PROFILE_DIR`: Specifies the path to save the profiler output, which can later be visualized using tools like [ui.perfetto.dev](https://ui.perfetto.dev/) or [chrome.tracing](chrome://tracing/).
+
+### Profiling Using Default Directory
+
+By default, profiling results are saved in either `vllm_benchmark_latency_result` or `vllm_benchmark_throughput_result`. To run a benchmark and profile it using the default directory, execute:
+
+```bash
+python3 benchmark_throughput.py --input-len {len} --output-len {len} --model {model} --profile-rpd
+```
+
+### Profiling With a Custom Directory
+
+You can specify a custom directory for saving profiler outputs by using the `--profile-dir` flag:
+
+```bash
+python3 benchmark_throughput.py --input-len {len} --output-len {len} --model {model} --profile-rpd --profile-dir {/path/to/custom/dir}
+```
+
+After profiling is complete, an `.rpd` file containing the trace data will be saved to the specified directory.
+
+## 3. Convert Trace Data to JSON Format
+
+To view the trace data, convert it into a format that is compatible with tools like Chrome tracing or Perfetto.
+
+You can use the `rpd2tracing.py` script in rocmProfileData to convert the `.rpd` file into a JSON file:
+
+```bash
+python3 rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
+```
+
+Once the trace is converted, open the `.json` file in [Chrome](chrome://tracing/) or [Perfetto](https://ui.perfetto.dev/) for visualization.
diff --git a/benchmarks/profiling/benchmark_latency.py b/benchmarks/profiling/benchmark_latency.py
new file mode 100644
index 000000000000..5df17ded53c7
--- /dev/null
+++ b/benchmarks/profiling/benchmark_latency.py
@@ -0,0 +1,202 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark the latency of processing a single batch of requests."""
+
+import argparse
+import dataclasses
+import json
+import os
+import time
+from contextlib import contextmanager, nullcontext
+from pathlib import Path
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptType
+from vllm.sampling_params import BeamSearchParams
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: argparse.Namespace):
+ print(args)
+
+ @contextmanager
+ def rpd_profiler_context():
+ from rpdTracerControl import rpdTracerControl as rpd
+
+ llm.start_profile()
+ yield
+ llm.stop_profile()
+ rpd.top_totals()
+
+ @contextmanager
+ def torch_profiler_context(profile_result_dir: str | None = None):
+ p = torch.profiler.profile(
+ activities=[
+ torch.profiler.ProfilerActivity.CPU,
+ torch.profiler.ProfilerActivity.CUDA,
+ ],
+ on_trace_ready=torch.profiler.tensorboard_trace_handler(
+ str(profile_result_dir)
+ ),
+ )
+ p.start()
+ try:
+ with torch.no_grad():
+ yield p
+ finally:
+ p.stop()
+ print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+
+ def get_profiling_context(profile_result_dir: str | None = None):
+ if args.profile_torch:
+ return torch_profiler_context(profile_result_dir)
+ elif args.profile_rpd:
+ return rpd_profiler_context()
+ else:
+ return nullcontext()
+
+ if args.profile_torch or args.profile_rpd:
+ profile_result_dir = Path(
+ args.profile_result_dir or "./vllm_benchmark_latency_result"
+ )
+ profile_result_dir.mkdir(parents=True, exist_ok=True)
+ name = os.path.basename(os.path.normpath(args.model))
+ model_trace_name = (
+ f"{name}_in_{args.input_len}_out_{args.output_len}_"
+ f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}"
+ )
+ print(f"Profiling (results will be saved to '{profile_result_dir}')...")
+ if args.profile_rpd:
+ profile_result_dir /= f"{model_trace_name}.rpd"
+ os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_result_dir)
+
+ engine_args = EngineArgs.from_cli_args(args)
+
+ # NOTE(woosuk): If the request cannot be processed in a single batch,
+ # the engine will automatically process the request in multiple batches.
+ llm = LLM(**dataclasses.asdict(engine_args))
+
+ sampling_params = SamplingParams(
+ n=args.n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=args.output_len,
+ )
+ print(sampling_params)
+ dummy_prompt_token_ids = np.random.randint(
+ 10000, size=(args.batch_size, args.input_len)
+ )
+ dummy_prompts: list[PromptType] = [
+ {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+ ]
+
+ def llm_generate():
+ if not args.use_beam_search:
+ llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
+ else:
+ llm.beam_search(
+ dummy_prompts,
+ BeamSearchParams(
+ beam_width=args.n,
+ max_tokens=args.output_len,
+ ignore_eos=True,
+ ),
+ )
+
+ def run_to_completion(profile_dir: str | None = None):
+ if profile_dir:
+ with get_profiling_context(profile_dir):
+ llm_generate()
+ else:
+ start_time = time.perf_counter()
+ llm_generate()
+ end_time = time.perf_counter()
+ latency = end_time - start_time
+ return latency
+
+ print("Warming up...")
+ for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+ run_to_completion(profile_dir=None)
+
+ if args.profile_torch or args.profile_rpd:
+ run_to_completion(profile_dir=profile_result_dir)
+ return
+
+ # Benchmark.
+ latencies = []
+ for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
+ latencies.append(run_to_completion(profile_dir=None))
+ latencies = np.array(latencies)
+ percentages = [10, 25, 50, 75, 90, 99]
+ percentiles = np.percentile(latencies, percentages)
+ print(f"Avg latency: {np.mean(latencies)} seconds")
+ for percentage, percentile in zip(percentages, percentiles):
+ print(f"{percentage}% percentile latency: {percentile} seconds")
+
+ # Output JSON results if specified
+ if args.output_json:
+ results = {
+ "avg_latency": np.mean(latencies),
+ "latencies": latencies.tolist(),
+ "percentiles": dict(zip(percentages, percentiles.tolist())),
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+ description="Benchmark the latency of processing a single batch of "
+ "requests till completion."
+ )
+ parser.add_argument("--input-len", type=int, default=32)
+ parser.add_argument("--output-len", type=int, default=128)
+ parser.add_argument("--batch-size", type=int, default=8)
+ parser.add_argument(
+ "--n", type=int, default=1, help="Number of generated sequences per prompt."
+ )
+ parser.add_argument("--use-beam-search", action="store_true")
+ parser.add_argument(
+ "--num-iters-warmup",
+ type=int,
+ default=10,
+ help="Number of iterations to run for warmup.",
+ )
+ parser.add_argument(
+ "--num-iters", type=int, default=30, help="Number of iterations to run."
+ )
+ parser.add_argument(
+ "--profile-torch",
+ action="store_true",
+ help="profile the generation process of a single batch",
+ )
+ parser.add_argument(
+ "--profile-rpd",
+ action="store_true",
+ help="profile the generation process of a single batch",
+ )
+ parser.add_argument(
+ "--profile-result-dir",
+ type=str,
+ default=os.getenv("VLLM_RPD_PROFILER_DIR", default=None),
+ help=(
+ "path to save the profiler output. Can be visualized "
+ "with ui.perfetto.dev or Tensorboard."
+ ),
+ )
+ parser.add_argument(
+ "--output-json",
+ type=str,
+ default=None,
+ help="Path to save the latency results in JSON format.",
+ )
+
+ parser = EngineArgs.add_cli_args(parser)
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py
new file mode 100644
index 000000000000..cfb4e587dd75
--- /dev/null
+++ b/benchmarks/profiling/benchmark_throughput.py
@@ -0,0 +1,636 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark offline inference throughput."""
+
+import argparse
+import dataclasses
+import json
+import os
+import random
+import time
+from contextlib import contextmanager, nullcontext
+from functools import cache
+from pathlib import Path
+
+import torch
+import uvloop
+from PIL import Image
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+ build_async_engine_client_from_engine_args,
+)
+from vllm.inputs import TextPrompt
+from vllm.lora.request import LoRARequest
+from vllm.lora.utils import get_adapter_absolute_path
+from vllm.multimodal import MultiModalDataDict
+from vllm.sampling_params import BeamSearchParams
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+@dataclasses.dataclass
+class SampleRequest:
+ """A class representing a single inference request for benchmarking.
+
+ Attributes:
+ prompt: The input text prompt for the model.
+ prompt_len: The length of the prompt in tokens.
+ expected_output_len: The expected length of the output in tokens.
+ multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+ images).
+ lora_request: Optional LoRARequest specifying the LoRA to use.
+ """
+
+ prompt: str
+ prompt_len: int
+ expected_output_len: int
+ multi_modal_data: MultiModalDataDict | None = None
+ lora_request: LoRARequest | None = None
+
+
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+ """Prepend and append special tokens around the question to form a prompt.
+
+ Args:
+ question: The input question text to wrap with special tokens
+ model: The name of the model being used, to determine which special
+ tokens to add
+
+ Returns:
+ The formatted prompt string with appropriate special tokens for the
+ model
+
+ Raises:
+ ValueError: If an unsupported model name is provided
+ """
+ model = model.lower()
+ if "pixtral" in model:
+ return f"[INST]{question}\n[IMG][/INST]"
+ raise ValueError(f"Unsupported model {model}")
+
+
+@cache
+def lora_path_on_disk(lora_path: str) -> str:
+ return get_adapter_absolute_path(lora_path)
+
+
+lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
+
+
+def get_random_lora_request(
+ args: argparse.Namespace,
+) -> tuple[LoRARequest, AnyTokenizer | None]:
+ global lora_tokenizer_cache
+ lora_id = random.randint(1, args.max_loras)
+ lora_request = LoRARequest(
+ lora_name=str(lora_id),
+ lora_int_id=lora_id,
+ lora_path=lora_path_on_disk(args.lora_path),
+ )
+ if lora_id not in lora_tokenizer_cache:
+ lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
+ return lora_request, lora_tokenizer_cache[lora_id]
+
+
+def sample_requests(
+ tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
+) -> list[SampleRequest]:
+ dataset_path: str = args.dataset
+ num_requests: int = args.num_prompts
+ fixed_output_len: int | None = args.output_len
+ model: str = args.model
+ if fixed_output_len is not None and fixed_output_len < 4:
+ raise ValueError("output_len too small")
+
+ # Load the dataset.
+ with open(dataset_path) as f:
+ dataset = json.load(f)
+ # Filter out the conversations with less than 2 turns.
+ dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+ # Shuffle the dataset.
+ random.shuffle(dataset)
+
+ # Filter out sequences that are too long or too short
+ filtered_dataset: list[SampleRequest] = []
+    for data in tqdm(dataset, total=len(dataset), desc="sampling requests"):
+ if len(filtered_dataset) == num_requests:
+ break
+
+ # Only keep the first two turns of each conversation.
+ prompt = data["conversations"][0]["value"]
+ completion = data["conversations"][1]["value"]
+
+ multi_modal_data: MultiModalDataDict | None = None
+ if "image" in data:
+ multi_modal_data = multi_modal_data or {}
+ image_path = data["image"]
+ # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+ assert isinstance(image_path, str), "Only support single image input"
+ try:
+ multi_modal_data["image"] = Image.open(image_path).convert("RGB")
+ except FileNotFoundError:
+ # Ignore datapoint where asset is missing
+ continue
+ prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
+ request_tokenizer = tokenizer
+ lora_request: LoRARequest | None = None
+ if args.enable_lora:
+ lora_request, lora_tokenizer = get_random_lora_request(args)
+ if lora_tokenizer:
+ request_tokenizer = lora_tokenizer
+
+ # Tokenize the prompts and completions.
+ prompt_token_ids = request_tokenizer(prompt).input_ids
+ completion_token_ids = request_tokenizer(completion).input_ids
+ prompt_len = len(prompt_token_ids)
+ output_len = (
+ len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+ )
+ if prompt_len < 4 or output_len < 4:
+ # Prune too short sequences.
+ continue
+ if prompt_len > 1024 or prompt_len + output_len > 2048:
+ # Prune too long sequences.
+ continue
+ filtered_dataset.append(
+ SampleRequest(
+ prompt=prompt,
+ prompt_len=prompt_len,
+ expected_output_len=output_len,
+ multi_modal_data=multi_modal_data,
+ lora_request=lora_request,
+ )
+ )
+
+ return filtered_dataset
+
+
+def run_vllm(
+ requests: list[SampleRequest],
+ n: int,
+ engine_args: EngineArgs,
+) -> float:
+ from vllm import LLM, SamplingParams
+
+ @contextmanager
+ def rpd_profiler_context():
+ from rpdTracerControl import rpdTracerControl as rpd
+
+ llm.start_profile()
+ yield
+ llm.stop_profile()
+ rpd.top_totals()
+
+ @contextmanager
+ def torch_profiler_context(profile_dir: str | None = None):
+ p = torch.profiler.profile(
+ activities=[
+ torch.profiler.ProfilerActivity.CPU,
+ torch.profiler.ProfilerActivity.CUDA,
+ ],
+ on_trace_ready=torch.profiler.tensorboard_trace_handler(str(profile_dir)),
+ )
+ p.start()
+ try:
+ with torch.no_grad():
+ yield p
+ finally:
+ p.stop()
+ print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
+
+ def get_profiling_context(profile_dir: str | None = None):
+ if args.profile_torch:
+ return torch_profiler_context(profile_dir)
+ elif args.profile_rpd:
+ return rpd_profiler_context()
+ else:
+ return nullcontext()
+
+ if args.profile_torch or args.profile_rpd:
+ profile_dir = Path(args.profile_dir or "./vllm_benchmark_throughput_result")
+ profile_dir.mkdir(parents=True, exist_ok=True)
+ name = os.path.basename(os.path.normpath(args.model))
+ model_trace_name = (
+ f"{name}_in_{args.input_len}_out_{args.output_len}_"
+ f"tp_{args.tensor_parallel_size}"
+ )
+ print(f"Profiling (results will be saved to '{profile_dir}')...")
+ if args.profile_rpd:
+ profile_dir /= f"{model_trace_name}.rpd"
+ os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_dir)
+
+ llm = LLM(**dataclasses.asdict(engine_args))
+
+ # Add the requests to the engine.
+ prompts: list[TextPrompt] = []
+ sampling_params: list[SamplingParams] = []
+ for request in requests:
+ prompts.append(
+ TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)
+ )
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=request.expected_output_len,
+ )
+ )
+ lora_requests: list[LoRARequest] | None = None
+ if engine_args.enable_lora:
+ lora_requests = [request.lora_request for request in requests]
+
+ use_beam_search = False
+
+ if not use_beam_search:
+ execute = lambda: llm.generate(
+ prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
+ )
+ else:
+ assert lora_requests is None, "BeamSearch API does not support LoRA"
+ prompts = [request.prompt for request in requests]
+ # output_len should be the same for all requests.
+        output_len = requests[0].expected_output_len
+ for request in requests:
+ assert request.expected_output_len == output_len
+ execute = lambda: llm.beam_search(
+ prompts,
+ BeamSearchParams(
+ beam_width=n,
+ max_tokens=output_len,
+ ignore_eos=True,
+ ),
+ )
+
+ if args.profile_torch or args.profile_rpd:
+ with get_profiling_context(profile_dir):
+ execute()
+ return
+ else:
+ start = time.perf_counter()
+ execute()
+ end = time.perf_counter()
+ return end - start
+
+
+async def run_vllm_async(
+ requests: list[SampleRequest],
+ n: int,
+ engine_args: AsyncEngineArgs,
+ disable_frontend_multiprocessing: bool = False,
+) -> float:
+ from vllm import SamplingParams
+
+ async with build_async_engine_client_from_engine_args(
+ engine_args, disable_frontend_multiprocessing
+ ) as llm:
+ # Add the requests to the engine.
+ prompts: list[TextPrompt] = []
+ sampling_params: list[SamplingParams] = []
+ lora_requests: list[LoRARequest | None] = []
+ for request in requests:
+ prompts.append(
+ TextPrompt(
+ prompt=request.prompt, multi_modal_data=request.multi_modal_data
+ )
+ )
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=1.0,
+ top_p=1.0,
+ ignore_eos=True,
+                    max_tokens=request.expected_output_len,
+ )
+ )
+ lora_requests.append(request.lora_request)
+
+ generators = []
+ start = time.perf_counter()
+ for i, (prompt, sp, lr) in enumerate(
+ zip(prompts, sampling_params, lora_requests)
+ ):
+ generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
+ generators.append(generator)
+ all_gens = merge_async_iterators(*generators)
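+        # Drain every result stream concurrently; only the wall-clock time matters here.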
+ async for i, res in all_gens:
+ pass
+ end = time.perf_counter()
+ return end - start
+
+
+def run_hf(
+ requests: list[SampleRequest],
+ model: str,
+ tokenizer: PreTrainedTokenizerBase,
+ n: int,
+ max_batch_size: int,
+ trust_remote_code: bool,
+) -> float:
+ llm = AutoModelForCausalLM.from_pretrained(
+ model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
+ )
+ if llm.config.model_type == "llama":
+ # To enable padding in the HF backend.
+ tokenizer.pad_token = tokenizer.eos_token
+ llm = llm.cuda()
+
+ pbar = tqdm(total=len(requests))
+ start = time.perf_counter()
+ batch: list[str] = []
+ max_prompt_len = 0
+ max_output_len = 0
+ for i in range(len(requests)):
+        request = requests[i]
+        prompt = request.prompt
+        prompt_len = request.prompt_len
+        output_len = request.expected_output_len
+ # Add the prompt to the batch.
+ batch.append(prompt)
+ max_prompt_len = max(max_prompt_len, prompt_len)
+ max_output_len = max(max_output_len, output_len)
+ if len(batch) < max_batch_size and i != len(requests) - 1:
+ # Check if we can add more requests to the batch.
+            next_request = requests[i + 1]
+            next_prompt_len = next_request.prompt_len
+            next_output_len = next_request.expected_output_len
+ if (
+ max(max_prompt_len, next_prompt_len)
+ + max(max_output_len, next_output_len)
+ ) <= 2048:
+ # We can add more requests to the batch.
+ continue
+
+ # Generate the sequences.
+ input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
+ llm_outputs = llm.generate(
+ input_ids=input_ids.cuda(),
+ do_sample=True,
+ num_return_sequences=n,
+ temperature=1.0,
+ top_p=1.0,
+ use_cache=True,
+ max_new_tokens=max_output_len,
+ )
+ # Include the decoding time.
+ tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+ pbar.update(len(batch))
+
+ # Clear the batch.
+ batch = []
+ max_prompt_len = 0
+ max_output_len = 0
+ end = time.perf_counter()
+ return end - start
+
+
+def run_mii(
+ requests: list[SampleRequest],
+ model: str,
+ tensor_parallel_size: int,
+ output_len: int,
+) -> float:
+ from mii import client, serve
+
+ llm = serve(model, tensor_parallel=tensor_parallel_size)
+ prompts = [request.prompt for request in requests]
+
+ start = time.perf_counter()
+ llm.generate(prompts, max_new_tokens=output_len)
+ end = time.perf_counter()
+    mii_client = client(model)
+    mii_client.terminate_server()
+ return end - start
+
+
+def main(args: argparse.Namespace):
+ print(args)
+ random.seed(args.seed)
+
+ # Sample the requests.
+ tokenizer = AutoTokenizer.from_pretrained(
+ args.tokenizer, trust_remote_code=args.trust_remote_code
+ )
+ if args.dataset is None:
+ vocab_size = tokenizer.vocab_size
+ requests = []
+ for _ in range(args.num_prompts):
+ request_tokenizer = tokenizer
+ lora_request: LoRARequest | None = None
+ if args.enable_lora:
+ lora_request, lora_tokenizer = get_random_lora_request(args)
+ if lora_tokenizer:
+ request_tokenizer = lora_tokenizer
+
+ # Synthesize a prompt with the given input length.
+ candidate_ids = [
+ random.randint(0, vocab_size - 1) for _ in range(args.input_len)
+ ]
+ # As tokenizer may add additional tokens like BOS, we need to try
+ # different lengths to get the desired input length.
+ for _ in range(5): # Max attempts to correct
+ candidate_prompt = request_tokenizer.decode(candidate_ids)
+ tokenized_len = len(request_tokenizer.encode(candidate_prompt))
+
+ if tokenized_len == args.input_len:
+ break
+
+ # Adjust length based on difference
+ diff = args.input_len - tokenized_len
+ if diff > 0:
+ candidate_ids.extend(
+ [random.randint(100, vocab_size - 100) for _ in range(diff)]
+ )
+ else:
+ candidate_ids = candidate_ids[:diff]
+ requests.append(
+ SampleRequest(
+ prompt=candidate_prompt,
+ prompt_len=args.input_len,
+ expected_output_len=args.output_len,
+ lora_request=lora_request,
+ )
+ )
+ else:
+ requests = sample_requests(tokenizer, args)
+
+ is_multi_modal = any(request.multi_modal_data is not None for request in requests)
+
+ if args.backend == "vllm":
+ if args.async_engine:
+ elapsed_time = uvloop.run(
+ run_vllm_async(
+ requests,
+ args.n,
+ AsyncEngineArgs.from_cli_args(args),
+ args.disable_frontend_multiprocessing,
+ )
+ )
+ else:
+ elapsed_time = run_vllm(requests, args.n, EngineArgs.from_cli_args(args))
+ elif args.backend == "hf":
+ assert args.tensor_parallel_size == 1
+ elapsed_time = run_hf(
+ requests,
+ args.model,
+ tokenizer,
+ args.n,
+ args.hf_max_batch_size,
+ args.trust_remote_code,
+ )
+ elif args.backend == "mii":
+ elapsed_time = run_mii(
+ requests, args.model, args.tensor_parallel_size, args.output_len
+ )
+ else:
+ raise ValueError(f"Unknown backend: {args.backend}")
+ total_num_tokens = sum(
+ request.prompt_len + request.expected_output_len for request in requests
+ )
+ total_output_tokens = sum(request.expected_output_len for request in requests)
+
+ if args.profile_torch or args.profile_rpd:
+ # Profiling complete
+ pass
+ else:
+ if is_multi_modal:
+ print(
+ "\033[91mWARNING\033[0m: Multi-modal request detected. The "
+ "following metrics are not accurate because image tokens are"
+ " not counted. See vllm-project/vllm/issues/9778 for details."
+ )
+        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
+ print(
+ f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+ f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+ f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
+ )
+
+ # Output JSON results if specified
+ if args.output_json:
+ results = {
+ "elapsed_time": elapsed_time,
+ "num_requests": len(requests),
+ "total_num_tokens": total_num_tokens,
+ "requests_per_second": len(requests) / elapsed_time,
+ "tokens_per_second": total_num_tokens / elapsed_time,
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+ parser.add_argument(
+ "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
+ )
+ parser.add_argument(
+ "--dataset", type=str, default=None, help="Path to the dataset."
+ )
+ parser.add_argument(
+ "--input-len",
+ type=int,
+ default=None,
+ help="Input prompt length for each request",
+ )
+ parser.add_argument(
+ "--output-len",
+ type=int,
+ default=None,
+ help="Output length for each request. Overrides the "
+ "output length from the dataset.",
+ )
+ parser.add_argument(
+ "--n", type=int, default=1, help="Number of generated sequences per prompt."
+ )
+ parser.add_argument(
+ "--num-prompts", type=int, default=1000, help="Number of prompts to process."
+ )
+ parser.add_argument(
+ "--hf-max-batch-size",
+ type=int,
+ default=None,
+ help="Maximum batch size for HF backend.",
+ )
+ parser.add_argument(
+ "--output-json",
+ type=str,
+ default=None,
+ help="Path to save the throughput results in JSON format.",
+ )
+ parser.add_argument(
+ "--async-engine",
+ action="store_true",
+ default=False,
+ help="Use vLLM async engine rather than LLM class.",
+ )
+ parser.add_argument(
+ "--disable-frontend-multiprocessing",
+ action="store_true",
+ default=False,
+ help="Disable decoupled async engine frontend.",
+ )
+ # LoRA
+ parser.add_argument(
+ "--lora-path",
+ type=str,
+ default=None,
+ help="Path to the lora adapters to use. This can be an absolute path, "
+ "a relative path, or a Hugging Face model identifier.",
+ )
+ parser.add_argument(
+ "--profile-torch",
+ action="store_true",
+        help="Profile the generation of a single batch with the PyTorch profiler.",
+ )
+ parser.add_argument(
+ "--profile-rpd",
+ action="store_true",
+        help="Profile the generation of a single batch with the RPD profiler.",
+ )
+ parser.add_argument(
+ "--profile-dir",
+ type=str,
+ default=None,
+ help=(
+ "path to save the profiler output. Can be visualized "
+ "with ui.perfetto.dev or Tensorboard."
+ ),
+ )
+
+ parser = AsyncEngineArgs.add_cli_args(parser)
+ args = parser.parse_args()
+ if args.tokenizer is None:
+ args.tokenizer = args.model
+ if args.dataset is None:
+ assert args.input_len is not None
+ assert args.output_len is not None
+ else:
+ assert args.input_len is None
+ if args.enable_lora:
+ assert args.lora_path is not None
+
+ if args.backend == "vllm":
+ if args.hf_max_batch_size is not None:
+ raise ValueError("HF max batch size is only for HF backend.")
+ elif args.backend == "hf":
+ if args.hf_max_batch_size is None:
+ raise ValueError("HF max batch size is required for HF backend.")
+ if args.quantization is not None:
+ raise ValueError("Quantization is only for vLLM backend.")
+        if args.enable_lora:
+ raise ValueError("LoRA benchmarking is only supported for vLLM backend")
+ elif args.backend == "mii":
+ if args.dtype != "auto":
+ raise ValueError("dtype must be auto for MII backend.")
+ if args.n != 1:
+ raise ValueError("n must be 1 for MII backend.")
+ if args.quantization is not None:
+ raise ValueError("Quantization is only for vLLM backend.")
+ if args.hf_max_batch_size is not None:
+ raise ValueError("HF max batch size is only for HF backend.")
+ if args.tokenizer != args.model:
+ raise ValueError("Tokenizer must be the same as the model for MII backend.")
+        if args.enable_lora:
+ raise ValueError("LoRA benchmarking is only supported for vLLM backend")
+ main(args)
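The throughput metrics printed by main() above are plain arithmetic over the sampled requests. A minimal standalone sketch of that calculation (not part of the patch; the request lengths and elapsed time below are invented):

    import json

    # (prompt_len, output_len) pairs; values are illustrative only.
    requests = [(512, 128), (256, 256), (1024, 64)]
    elapsed_time = 3.2  # seconds, hypothetical

    total_num_tokens = sum(p + o for p, o in requests)
    total_output_tokens = sum(o for _, o in requests)
    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    # Same schema as the --output-json file written above.
    results = {
        "elapsed_time": elapsed_time,
        "num_requests": len(requests),
        "total_num_tokens": total_num_tokens,
        "requests_per_second": len(requests) / elapsed_time,
        "tokens_per_second": total_num_tokens / elapsed_time,
    }
    print(json.dumps(results, indent=4))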
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
deleted file mode 100644
index 65b1e09a247e..000000000000
--- a/benchmarks/pyproject.toml
+++ /dev/null
@@ -1,49 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
- # pycodestyle
- "E",
- # Pyflakes
- "F",
- # pyupgrade
- "UP",
- # flake8-bugbear
- "B",
- # flake8-simplify
- "SIM",
- # isort
- "I",
- # flake8-logging-format
- "G",
-]
-ignore = [
- # star imports
- "F405", "F403",
- # lambda expression assignment
- "E731",
- # Loop control variable not used within loop body
- "B007",
- # f-string format
- "UP032",
- # Can remove once 3.10+ is the minimum Python version
- "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
\ No newline at end of file
diff --git a/benchmarks/test_accuracy.py b/benchmarks/test_accuracy.py
new file mode 100644
index 000000000000..bc91173d2ddb
--- /dev/null
+++ b/benchmarks/test_accuracy.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import dataclasses
+
+# from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: argparse.Namespace):
+ print(args)
+
+ engine_args = EngineArgs.from_cli_args(args)
+
+ # NOTE(woosuk): If the request cannot be processed in a single batch,
+ # the engine will automatically process the request in multiple batches.
+ llm = LLM(**dataclasses.asdict(engine_args))
+
+ sampling_params = SamplingParams(
+ n=args.n,
+ temperature=0,
+ top_p=1.0,
+ ignore_eos=True,
+ max_tokens=args.output_len,
+ )
+ print(sampling_params)
+
+ # tokenizer = AutoTokenizer.from_pretrained(engine_args.model)
+ # inputs = tokenizer('Hello, world!', return_tensors='pt').input_ids
+ inputs = [
+ "Hello, my name is",
+ "The president of the United States is",
+ ("1 + " * 50) + " 1 = ", # Longer prompt.
+ "The capital of France is",
+ ]
+ # Prompt 0: 'Hello, my name is',
+ # Generated text: ' John and I am a 30-year-old man from the United States. I am a software engineer by profession and I have been working in the tech industry for about 5 years now. I am married to a wonderful woman named Sarah, and we have two beautiful children together. We live in a cozy little house in the suburbs, and we love spending time outdoors and exploring new places.\n\nI am a bit of a introvert and I enjoy spending time alone, reading books, watching movies, and playing video games. I am also a bit of a foodie and I love trying out new recipes and experimenting with different cuisines. I' # noqa: E501
+ # Prompt 1: 'The president of the United States is',
+ # Generated text: ' the head of state and head of government of the United States. The president directs the executive branch of the federal government and is the commander-in-chief of the United States Armed Forces.\nThe president is elected by the people through the Electoral College to a four-year term, and is one of only two nationally elected federal officers, the other being the Vice President of the United States. The Twenty-second Amendment to the United States Constitution prohibits anyone from being elected to the presidency more than twice.\nThe president is both the head of state and head of government of the United States, and is the leader of the executive branch of the federal government. The president' # noqa: E501
+ # Prompt 2: '1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 = ', # noqa: E501
+ # Generated text: "50\nThe answer is 50.<|start_header_id|>assistant<|end_header_id|>\n\nThat's correct!\n\nYou added 50 ones together, and the result is indeed 50. Well done!\n\nWould you like to try another math problem?<|start_header_id|>assistant<|end_header_id|>\n\nI can generate a new problem for you. Here it is:\n\n2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 + 2 = ?\n\nCan you add up all the" # noqa: E501
+ # Prompt 3: 'The capital of France is',
+ # Generated text: " a city of love, art, fashion, and cuisine. Paris, the City of Light, is a must-visit destination for anyone who appreciates beauty, history, and culture. From the iconic Eiffel Tower to the world-famous Louvre Museum, there's no shortage of things to see and do in this incredible city.\nHere are some of the top attractions and experiences to add to your Parisian itinerary:\n1. The Eiffel Tower: This iconic iron lattice tower is a symbol of Paris and one of the most recognizable landmarks in the world. Take the elevator to the top for breathtaking views of the city.\n2" # noqa: E501
+
+ outputs = llm.generate(inputs, sampling_params)
+ for i, output in enumerate(outputs):
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt {i}: {prompt!r}, Generated text: {generated_text!r}")
+ # print(tokenizer.decode(outputs[0]))
+
+
+if __name__ == "__main__":
+ parser = FlexibleArgumentParser(
+        description="Spot-check greedy generation output for a fixed set of prompts."
+ )
+ parser.add_argument("--input-len", type=int, default=32)
+ parser.add_argument("--output-len", type=int, default=128)
+ parser.add_argument("--batch-size", type=int, default=8)
+ parser.add_argument(
+ "--n", type=int, default=1, help="Number of generated sequences per prompt."
+ )
+ parser.add_argument("--use-beam-search", action="store_true")
+ parser.add_argument(
+ "--num-iters-warmup",
+ type=int,
+ default=10,
+ help="Number of iterations to run for warmup.",
+ )
+ parser.add_argument(
+ "--num-iters", type=int, default=30, help="Number of iterations to run."
+ )
+ parser.add_argument(
+ "--profile",
+ action="store_true",
+ help="profile the generation process of a single batch",
+ )
+ parser.add_argument(
+ "--profile-result-dir",
+ type=str,
+ default=None,
+ help=(
+ "path to save the pytorch profiler output. Can be visualized "
+ "with ui.perfetto.dev or Tensorboard."
+ ),
+ )
+ parser.add_argument(
+ "--output-json",
+ type=str,
+ default=None,
+ help="Path to save the latency results in JSON format.",
+ )
+
+ parser = EngineArgs.add_cli_args(parser)
+ args = parser.parse_args()
+ main(args)
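test_accuracy.py pins temperature to 0, so repeated runs of the same build should reproduce the reference completions recorded in the comments above. A minimal sketch of such a spot check (not part of the patch; the strings are placeholders, not real model output):

    # Placeholder reference/actual pairs; a real check would fill `actual`
    # from llm.generate() output.
    reference = {
        "Hello, my name is": " John and I am a 30-year-old man ...",
        "The capital of France is": " a city of love, art, fashion, and cuisine ...",
    }
    actual = dict(reference)  # pretend the new run reproduced the references

    for prompt, expected in reference.items():
        status = "OK" if actual[prompt] == expected else "MISMATCH"
        print(f"{status}: {prompt!r}")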
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 06494463223b..9bac5ea41c8d 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -101,6 +101,7 @@ else()
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
+ find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -177,8 +178,14 @@ elseif (S390_FOUND)
"-mzvector"
"-march=native"
"-mtune=native")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+ if(RVV_FOUND)
+    message(FATAL_ERROR "RVV is not supported yet.")
+ else()
+ list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
+ endif()
else()
- message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
+ message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
#
@@ -191,13 +198,24 @@ else()
endif()
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
- FetchContent_Declare(
- oneDNN
- GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
- GIT_TAG v3.9
- GIT_PROGRESS TRUE
- GIT_SHALLOW TRUE
- )
+ set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+ if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+ message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+ FetchContent_Declare(
+ oneDNN
+ SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+ )
+ else()
+ message(STATUS "Downloading oneDNN from GitHub")
+ FetchContent_Declare(
+ oneDNN
+ GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+ GIT_TAG v3.9
+ GIT_PROGRESS TRUE
+ GIT_SHALLOW TRUE
+ )
+ endif()
if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
@@ -206,6 +224,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
+ add_compile_definitions(VLLM_USE_ACL)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
@@ -258,7 +277,8 @@ set(VLLM_EXT_SRC
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
- "csrc/cpu/torch_bindings.cpp")
+ "csrc/cpu/torch_bindings.cpp"
+ "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
@@ -300,4 +320,4 @@ define_gpu_extension_target(
WITH_SOABI
)
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
\ No newline at end of file
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index 02224cfe3ee8..c9e7aec880b9 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
else()
FetchContent_Declare(
flashmla
- GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
- GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
+ GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
+ GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@@ -33,23 +33,64 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with
# sm90a
-cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
+
+set(SUPPORT_ARCHS)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
+ list(APPEND SUPPORT_ARCHS 9.0a)
+endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
+ list(APPEND SUPPORT_ARCHS 10.0a)
+endif()
+
+
+cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
+if(FLASH_MLA_ARCHS)
+ set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
+ list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
+
set(FlashMLA_SOURCES
- ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
- ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
- ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
- ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
- ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
+ ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
+ ${flashmla_SOURCE_DIR}/csrc/pybind.cpp
+ ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
+ ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
+ ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
+ )
+
+ set(FlashMLA_Extension_SOURCES
+ ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
+ ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
+ ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+ )
set(FlashMLA_INCLUDES
+ ${flashmla_SOURCE_DIR}/csrc
+ ${flashmla_SOURCE_DIR}/csrc/sm90
+ ${flashmla_SOURCE_DIR}/csrc/cutlass/include
+ ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
+ )
+
+ set(FlashMLA_Extension_INCLUDES
+ ${flashmla_SOURCE_DIR}/csrc
+ ${flashmla_SOURCE_DIR}/csrc/sm90
+ ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
${flashmla_SOURCE_DIR}/csrc/cutlass/include
- ${flashmla_SOURCE_DIR}/csrc)
+ ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
+ )
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
+ set_gencode_flags_for_srcs(
+ SRCS "${FlashMLA_Extension_SOURCES}"
+ CUDA_ARCHS "${FLASH_MLA_ARCHS}")
+
define_gpu_extension_target(
_flashmla_C
DESTINATION vllm
@@ -60,8 +101,32 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
USE_SABI 3
WITH_SOABI)
+
+ # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+ # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+ target_compile_options(_flashmla_C PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
+    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
+
+ define_gpu_extension_target(
+ _flashmla_extension_C
+ DESTINATION vllm
+ LANGUAGE ${VLLM_GPU_LANG}
+ SOURCES ${FlashMLA_Extension_SOURCES}
+ COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
+ ARCHITECTURES ${VLLM_GPU_ARCHES}
+ INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
+ USE_SABI 3
+ WITH_SOABI)
+
+ # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+ # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+ target_compile_options(_flashmla_extension_C PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
+    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
else()
- # Create an empty target for setup.py when not targeting sm90a systems
+ # Create empty targets for setup.py when not targeting sm90a systems
add_custom_target(_flashmla_C)
+ add_custom_target(_flashmla_extension_C)
endif()
diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake
new file mode 100644
index 000000000000..5a59a409999a
--- /dev/null
+++ b/cmake/external_projects/qutlass.cmake
@@ -0,0 +1,97 @@
+include(FetchContent)
+
+set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
+
+if(DEFINED ENV{QUTLASS_SRC_DIR})
+ set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
+endif()
+
+if(QUTLASS_SRC_DIR)
+ FetchContent_Declare(
+ qutlass
+ SOURCE_DIR ${QUTLASS_SRC_DIR}
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ )
+else()
+ FetchContent_Declare(
+ qutlass
+ GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
+ GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
+ GIT_PROGRESS TRUE
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ )
+endif()
+
+FetchContent_Populate(qutlass)
+
+if(NOT qutlass_SOURCE_DIR)
+ message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
+endif()
+message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
+
+cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
+
+ if(QUTLASS_ARCHS MATCHES "10\\.0a")
+ set(QUTLASS_TARGET_CC 100)
+ elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
+ set(QUTLASS_TARGET_CC 120)
+ else()
+ message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
+ endif()
+
+ set(QUTLASS_SOURCES
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
+ )
+
+ set(QUTLASS_INCLUDES
+ ${qutlass_SOURCE_DIR}
+ ${qutlass_SOURCE_DIR}/qutlass
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/include
+ ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
+ )
+
+ if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
+ list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
+ elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
+ list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
+ message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
+ else()
+ message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
+ "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
+ endif()
+
+ set_gencode_flags_for_srcs(
+ SRCS "${QUTLASS_SOURCES}"
+ CUDA_ARCHS "${QUTLASS_ARCHS}"
+ )
+
+ target_sources(_C PRIVATE ${QUTLASS_SOURCES})
+ target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
+ target_compile_definitions(_C PRIVATE
+ QUTLASS_DISABLE_PYBIND=1
+ TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
+ )
+
+ set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
+ )
+
+else()
+ if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
+ message(STATUS
+ "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
+ else()
+ message(STATUS
+ "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in "
+ "CUDA_ARCHS='${CUDA_ARCHS}'.")
+ endif()
+endif()
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 3d32121f13ac..931090db50e9 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
+ GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/hipify.py b/cmake/hipify.py
index 55d378f5b111..8504f9defee9 100755
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -16,7 +16,7 @@
from torch.utils.hipify.hipify_python import hipify
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Project directory where all the source + include files live.
@@ -34,15 +34,14 @@
)
# Source files to convert.
- parser.add_argument("sources",
- help="Source files to hipify.",
- nargs="*",
- default=[])
+ parser.add_argument(
+ "sources", help="Source files to hipify.", nargs="*", default=[]
+ )
args = parser.parse_args()
# Limit include scope to project_dir only
- includes = [os.path.join(args.project_dir, '*')]
+ includes = [os.path.join(args.project_dir, "*")]
# Get absolute path for all source files.
extra_files = [os.path.abspath(s) for s in args.sources]
@@ -51,25 +50,31 @@
# The directory might already exist to hold object files so we ignore that.
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
- hipify_result = hipify(project_directory=args.project_dir,
- output_directory=args.output_dir,
- header_include_dirs=[],
- includes=includes,
- extra_files=extra_files,
- show_detailed=True,
- is_pytorch_extension=True,
- hipify_extra_files_only=True)
+ hipify_result = hipify(
+ project_directory=args.project_dir,
+ output_directory=args.output_dir,
+ header_include_dirs=[],
+ includes=includes,
+ extra_files=extra_files,
+ show_detailed=True,
+ is_pytorch_extension=True,
+ hipify_extra_files_only=True,
+ )
hipified_sources = []
for source in args.sources:
s_abs = os.path.abspath(source)
- hipified_s_abs = (hipify_result[s_abs].hipified_path if
- (s_abs in hipify_result
- and hipify_result[s_abs].hipified_path is not None)
- else s_abs)
+ hipified_s_abs = (
+ hipify_result[s_abs].hipified_path
+ if (
+ s_abs in hipify_result
+ and hipify_result[s_abs].hipified_path is not None
+ )
+ else s_abs
+ )
hipified_sources.append(hipified_s_abs)
- assert (len(hipified_sources) == len(args.sources))
+ assert len(hipified_sources) == len(args.sources)
# Print hipified source files.
print("\n".join(hipified_sources))
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 9c0ed1d09572..f6a0d2b75be1 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
list(REMOVE_DUPLICATES _PTX_ARCHS)
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
- # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
- # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
+ # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+ # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
set(_CUDA_ARCHS)
foreach(_arch ${_SRC_CUDA_ARCHS})
- if(_arch MATCHES "\\a$")
+ if(_arch MATCHES "[af]$")
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
- string(REPLACE "a" "" _base "${_arch}")
+ string(REGEX REPLACE "[af]$" "" _base "${_arch}")
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
list(APPEND _CUDA_ARCHS "${_arch}")
@@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
endif()
- set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
target_compile_options(${GPU_MOD_NAME} PRIVATE
     $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
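The cuda_archs_loose_intersection change above extends the special-case suffix handling from "a" to both "a" and "f": a suffixed arch survives the intersection when its plain base version is in the target list. A small Python illustration of that rule (not the CMake implementation; the arch lists are invented):

    import re

    src_archs = ["9.0a", "10.0f", "8.9"]
    tgt_archs = ["9.0", "10.0", "8.9"]

    kept = []
    for arch in src_archs:
        if re.search(r"[af]$", arch):
            # Suffixed arch: keep it only if the plain base version is targeted.
            base = re.sub(r"[af]$", "", arch)
            if base in tgt_archs:
                kept.append(arch)
        elif arch in tgt_archs:
            kept.append(arch)
    print(kept)  # ['9.0a', '10.0f', '8.9']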
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 000000000000..304c0be8105f
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,12 @@
+codecov:
+ require_ci_to_pass: false
+
+fixes:
+ # Map source code paths to repository root paths
+ # Wildcards match any Python version (python3.*)
+ - "/vllm-workspace/src/vllm/::vllm/"
+ - "/vllm-workspace/vllm/::vllm/"
+ - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
+ - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
+ - "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
+ - "/usr/lib/python3.*/site-packages/vllm/::vllm/"
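The fixes entries above rewrite wheel and install prefixes in coverage reports back to repository-relative paths. A rough Python illustration of that prefix mapping (not Codecov's implementation; the sample path is invented):

    import fnmatch

    rules = [
        ("/vllm-workspace/src/vllm/", "vllm/"),
        ("/usr/local/lib/python3.*/dist-packages/vllm/", "vllm/"),
    ]

    def rewrite(path: str) -> str:
        head, sep, tail = path.partition("/vllm/")
        if not sep:
            return path
        prefix = head + "/vllm/"
        for pattern, replacement in rules:
            if fnmatch.fnmatch(prefix, pattern):
                return replacement + tail
        return path

    print(rewrite("/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py"))
    # -> vllm/engine/arg_utils.py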
diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh
index 57382c1ddc65..052ff168cec4 100644
--- a/csrc/attention/attention_kernels.cuh
+++ b/csrc/attention/attention_kernels.cuh
@@ -28,10 +28,10 @@
#ifdef USE_ROCM
   #include <hip/hip_bf16.h>
- #include "../quantization/fp8/amd/quant_utils.cuh"
+ #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
typedef __hip_bfloat16 __nv_bfloat16;
#else
- #include "../quantization/fp8/nvidia/quant_utils.cuh"
+ #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
diff --git a/csrc/attention/mla/cutlass_mla_entry.cu b/csrc/attention/mla/cutlass_mla_entry.cu
deleted file mode 100644
index 0319d1daf302..000000000000
--- a/csrc/attention/mla/cutlass_mla_entry.cu
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/all.h>
-
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
- torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale);
-#endif
-
-void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale) {
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
- return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale);
-#endif
- TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
-}
diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu
deleted file mode 100644
index 9d05d910dd81..000000000000
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.h"
-
-#include "cutlass_extensions/common.hpp"
-
-#include "device/sm100_mla.hpp"
-#include "kernel/sm100_mla_tile_scheduler.hpp"
-
-using namespace cute;
-using namespace cutlass::fmha::kernel;
-
-template
-struct MlaSm100 {
- using Element = T;
- using ElementAcc = float;
- using ElementOut = T;
-
- using TileShape = Shape<_128, _128, Shape<_512, _64>>;
- using TileShapeH = cute::tuple_element_t<0, TileShape>;
- using TileShapeD = cute::tuple_element_t<2, TileShape>;
-
- // H K (D_latent D_rope) B
-  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
-
- using StrideQ = cute::tuple; // H D B
- using StrideK = cute::tuple; // K D B
- using StrideO = StrideK; // H D B
- using StrideLSE = cute::tuple<_1, int>; // H B
-
- using TileScheduler =
- std::conditional_t;
-
- using FmhaKernel =
- cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
- TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
- /*kIsCpAsync=*/true>;
-  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
-};
-
-template <typename T>
-typename T::Fmha::Arguments args_from_options(
- at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
- at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
- at::Tensor const& page_table, double scale) {
- cutlass::KernelHardwareInfo hw_info;
- hw_info.device_id = q_nope.device().index();
- hw_info.sm_count =
- cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
- hw_info.device_id);
-
- int batches = q_nope.sizes()[0];
- int page_count_per_seq = page_table.sizes()[1];
- int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
- int page_size = kv_c_and_k_pe_cache.sizes()[1];
- int max_seq_len = page_size * page_count_per_seq;
- using TileShapeH = typename T::TileShapeH;
- using TileShapeD = typename T::TileShapeD;
- auto problem_shape =
- cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
-
- auto [H, K, D, B] = problem_shape;
- auto [D_latent, D_rope] = D;
-
- using StrideQ = typename T::StrideQ;
- using StrideK = typename T::StrideK;
- using StrideO = typename T::StrideO;
- using StrideLSE = typename T::StrideLSE;
-
- StrideQ stride_Q_latent = cute::make_tuple(
- static_cast(D_latent), _1{}, static_cast(H * D_latent));
- StrideQ stride_Q_rope = cute::make_tuple(static_cast(D_rope), _1{},
- static_cast(H * D_rope));
- StrideK stride_C =
- cute::make_tuple(static_cast(D_latent + D_rope), _1{},
- static_cast(page_size * (D_latent + D_rope)));
- StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
- StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast(H));
- StrideO stride_O = cute::make_tuple(static_cast(D_latent), _1{},
- static_cast(H * D_latent));
-
- using Element = typename T::Element;
- using ElementOut = typename T::ElementOut;
- using ElementAcc = typename T::ElementAcc;
- auto Q_latent_ptr = static_cast(q_nope.data_ptr());
- auto Q_rope_ptr = static_cast(q_pe.data_ptr());
- auto C_ptr = static_cast(kv_c_and_k_pe_cache.data_ptr());
- auto scale_f = static_cast(scale);
- typename T::Fmha::Arguments arguments{
- problem_shape,
- {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
- stride_C, C_ptr + D_latent, stride_C,
- static_cast(seq_lens.data_ptr()),
- static_cast(page_table.data_ptr()), stride_PT, page_count_total,
- page_size},
- {static_cast(out.data_ptr()), stride_O,
- static_cast(nullptr), stride_LSE},
- hw_info,
- 1, // split_kv
- nullptr, // is_var_split_kv
- };
- // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
- // split_kv automatically based on batch size and sequence length to balance
- // workload across available SMs. Consider using var_split_kv for manual
- // control if needed.
- T::Fmha::set_split_kv(arguments);
- return arguments;
-}
-
-template <typename T>
-void runMla(at::Tensor const& out, at::Tensor const& q_nope,
- at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
- at::Tensor const& seq_lens, at::Tensor const& page_table,
- float scale, cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<T>;
- typename MlaSm100Type::Fmha fmha;
- auto arguments = args_from_options(
- out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
- size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
- auto const workspace_options =
- torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
- auto workspace = torch::empty(workspace_size, workspace_options);
-
- CUTLASS_CHECK(fmha.can_implement(arguments));
-
- CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
-
- CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
-}
-
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
- torch::Tensor const& q_nope,
- torch::Tensor const& q_pe,
- torch::Tensor const& kv_c_and_k_pe_cache,
- torch::Tensor const& seq_lens,
- torch::Tensor const& page_table, double scale) {
- TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
- TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
- TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
- TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
- "kv_c_and_k_pe_cache must be a 3D tensor");
- TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
- TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
- TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
-
- auto B_q_nope = q_nope.size(0);
- auto H_q_nope = q_nope.size(1);
- auto D_q_nope = q_nope.size(2);
- auto B_q_pe = q_pe.size(0);
- auto H_q_pe = q_pe.size(1);
- auto D_q_pe = q_pe.size(2);
- auto B_pt = page_table.size(0);
- auto PAGE_NUM = page_table.size(1);
- auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
- auto D_ckv = kv_c_and_k_pe_cache.size(2);
- auto B_o = out.size(0);
- auto H_o = out.size(1);
- auto D_o = out.size(2);
-
- TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
- TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
- TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
- TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
- "H_q_nope, H_q_pe, and H_o must be equal to 128");
- TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
- "PAGE_SIZE must be a power of 2");
- TORCH_CHECK(
- B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
- "Batch dims must be same for page_table, q_nope and q_pe, and out");
- TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
- "PAGE_NUM must be divisible by 128 / PAGE_SIZE");
- TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
-
- TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
- q_nope.dtype() == at::ScalarType::BFloat16 ||
- q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
- "q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
- TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
- q_nope.dtype() == q_pe.dtype(),
- "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
- TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
- "seq_lens must be a 32-bit integer tensor");
- TORCH_CHECK(page_table.dtype() == torch::kInt32,
- "page_table must be a 32-bit integer tensor");
-
- auto in_dtype = q_nope.dtype();
- const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
- const cudaStream_t stream =
- at::cuda::getCurrentCUDAStream(q_nope.get_device());
- if (in_dtype == at::ScalarType::Half) {
-    runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
- page_table, scale, stream);
- } else if (in_dtype == at::ScalarType::BFloat16) {
-    runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale, stream);
- } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-    runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
- seq_lens, page_table, scale, stream);
- } else {
- TORCH_CHECK(false, "Unsupported input data type of MLA");
- }
-}
diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
index 95e32559cd54..2d4b4a67d242 100644
--- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp
@@ -125,24 +125,37 @@ class MLA {
}
static void set_split_kv (KernelArguments& args) {
- // printf("set_split_kv start");
if (args.split_kv >= 1) return;
auto [H, K, D, B] = args.problem_shape;
- // std::cout << H << " " << K << " " << D << " " << B << "\n";
int sm_count = args.hw_info.sm_count;
- // printf(" sm_count = %d\n", sm_count);
- int max_splits = ceil_div(K, 128);
- max_splits = min(16, max_splits);
- // printf(" max_splits = %d\n", max_splits);
+ float seq_length_k = static_cast(K) / 1024.0f;
+ int max_splits = 1;
+
+ if (B <= 4 && seq_length_k >= 16) {
+ max_splits = 16;
+ }
+ else if (B <= 8 && seq_length_k >= 4) {
+ max_splits = 8;
+ }
+ else if ((B <= 16 && seq_length_k >= 8) ||
+ (B == 48 && seq_length_k >= 32)) {
+ max_splits = 4;
+ }
+ else if ((B <= 32 && seq_length_k >= 16) ||
+ (B == 96 && seq_length_k >= 16)) {
+ max_splits = 2;
+ }
+ else {
+ max_splits = 1;
+ }
+
+ // Wave-aware scheduling: ensure integer number of waves in K dimension
int sms_per_batch = max(1, sm_count / B);
- // printf(" sms_per_batch = %d\n", sms_per_batch);
int split_heur = min(max_splits, sms_per_batch);
int waves = ceil_div(B * split_heur, sm_count);
int k_waves = ceil_div(max_splits, split_heur);
int split_wave_aware = ceil_div(max_splits, k_waves);
args.split_kv = split_wave_aware;
- // printf(" args.split_kv = %d\n", args.split_kv);
-
}
/// Determines whether the GEMM can execute the given problem.
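The rewritten set_split_kv above first picks a maximum KV-split factor from the batch size B and the KV length K, then rounds it so the splits form whole waves over the available SMs. A Python transcription of that heuristic, for illustration only (the SM count in the example is made up):

    from math import ceil

    def split_kv_heuristic(B: int, K: int, sm_count: int) -> int:
        seq_length_k = K / 1024.0
        if B <= 4 and seq_length_k >= 16:
            max_splits = 16
        elif B <= 8 and seq_length_k >= 4:
            max_splits = 8
        elif (B <= 16 and seq_length_k >= 8) or (B == 48 and seq_length_k >= 32):
            max_splits = 4
        elif (B <= 32 and seq_length_k >= 16) or (B == 96 and seq_length_k >= 16):
            max_splits = 2
        else:
            max_splits = 1
        # Wave-aware rounding: keep an integer number of waves in the K dimension.
        sms_per_batch = max(1, sm_count // B)
        split_heur = min(max_splits, sms_per_batch)
        k_waves = ceil(max_splits / split_heur)
        return ceil(max_splits / k_waves)

    print(split_kv_heuristic(B=4, K=32 * 1024, sm_count=132))  # -> 16; SM count is hypothetical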
diff --git a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
index 2cbc2379579e..1f62c37ba4b7 100644
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
@@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
+ if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
- if (local_split_kv <= get<3>(blk_coord))
- continue;
+ if (local_split_kv <= get<3>(blk_coord))
+ continue;
load_page_table(
blk_coord,
problem_shape,
params.mainloop,
shared_storage.tensors,
pipeline_page_table, pipeline_pt_producer_state,
- local_split_kv
+ local_split_kv
);
}
}
@@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
- auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto problem_shape = params.problem_shape;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
+ if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
load_cpasync(
blk_coord,
@@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
params.mainloop_params,
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
- local_split_kv,
+ local_split_kv,
/* must be shared pipe */
pipeline_page_table, pipeline_pt_consumer_state
);
@@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
- auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto problem_shape = params.problem_shape;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
- local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
- }
+ if (params.ptr_split_kv != nullptr) {
+ local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+ }
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
         load_tma</* paged= */ true>(
blk_coord,
@@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
pipeline_load_qk, pipeline_load_qk_producer_state,
- local_split_kv
+ local_split_kv
);
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
}
@@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
- auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto problem_shape = params.problem_shape;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
+ if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
- }
+ }
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
         load_tma</* paged= */ false>(
blk_coord,
@@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
pipeline_load_qk, pipeline_load_qk_producer_state,
- local_split_kv
+ local_split_kv
);
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
}
@@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
- auto local_split_kv = params.split_kv;
+ auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
mma(blk_coord,
problem_shape,
@@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
pipeline_mma_s, pipeline_mma_s_producer_state,
pipeline_p_mma, pipeline_p_mma_consumer_state,
pipeline_mma_o, pipeline_mma_o_producer_state,
- local_split_kv
+ local_split_kv
);
}
}
@@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
- auto split_kv = params.split_kv;
- auto local_split_kv = split_kv;
+ auto split_kv = params.split_kv;
+ auto local_split_kv = split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
- if (params.ptr_split_kv != nullptr) {
+ if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
- if (local_split_kv <= get<3>(blk_coord))
+ if (local_split_kv <= get<3>(blk_coord))
continue;
compute(
blk_coord,
@@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
pipeline_mma_s, pipeline_mma_s_consumer_state,
pipeline_p_mma, pipeline_p_mma_producer_state,
pipeline_mma_o, pipeline_mma_o_consumer_state,
- local_split_kv
+ local_split_kv
);
}
@@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
cutlass::arch::NamedBarrier(
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
kNamedBarrierEpilogue
- ).arrive();
+ ).arrive_and_wait();
return;
}
diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
index c60f1823b8a1..d1874515cc8f 100644
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@@ -43,6 +43,7 @@ void sm100_cutlass_mla_decode(
torch::Tensor const& seq_lens,
torch::Tensor const& page_table,
torch::Tensor const& workspace,
+ double sm_scale,
int64_t num_kv_splits) {
TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
}
diff --git a/csrc/cache.h b/csrc/cache.h
index fd230bec27fc..b162a4a2bc31 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -56,3 +56,19 @@ void cp_gather_cache(
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
     int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
+
+// Indexer K quantization and cache function
+void indexer_k_quant_and_cache(
+ torch::Tensor& k, // [num_tokens, head_dim]
+ torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
+ torch::Tensor& slot_mapping, // [num_tokens]
+ int64_t quant_block_size, // quantization block size
+ const std::string& scale_fmt);
+
+// Extract function to gather quantized K cache
+void cp_gather_indexer_k_quant_cache(
+ const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride]
+ torch::Tensor& dst_k, // [num_tokens, head_dim]
+ torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4]
+ const torch::Tensor& block_table, // [batch_size, num_blocks]
+ const torch::Tensor& cu_seq_lens); // [batch_size + 1]
\ No newline at end of file
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 80b4c47c5547..72a7ae4111f1 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -9,15 +9,14 @@
#include "quantization/vectorization_utils.cuh"
#ifdef USE_ROCM
- #include "quantization/fp8/amd/quant_utils.cuh"
+ #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
#else
- #include "quantization/fp8/nvidia/quant_utils.cuh"
+ #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
#endif
#include
#include
-#include