
Commit 3e90eab

Enable Xeon optimizations like Tensor Parallel and AMX from vLLM 0.10.0 (#2106)
Signed-off-by: Tsai, Louie <[email protected]>
1 parent c133b2f commit 3e90eab

File tree: 4 files changed, +76 / -0 lines changed


ChatQnA/docker_compose/intel/cpu/xeon/README.md

Lines changed: 7 additions & 0 deletions
@@ -73,6 +73,13 @@ CPU example with Open Telemetry feature:
 docker compose -f compose.yaml -f compose.telemetry.yaml up -d
 ```

+To enable Xeon optimizations such as AMX or Tensor Parallel for vLLM, the compose.perf.yaml file needs to be merged with the default compose.yaml file.
+CPU example with the optimized vLLM feature:
+
+```bash
+docker compose -f compose.yaml -f compose.perf.yaml up -d
+```
+
 **Note**: developers should build docker image from source when:

 - Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
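AMX only takes effect on CPUs that expose it (4th Gen Intel Xeon Scalable, i.e. Sapphire Rapids, or newer). As a quick sanity check before layering compose.perf.yaml on top of compose.yaml, you can confirm the host advertises the AMX CPU flags; this check is not part of the commit, just a convenience sketch:

```bash
# Print any AMX-related CPU flags (amx_tile, amx_bf16, amx_int8).
# No output means the CPU lacks AMX, and only the non-AMX optimizations
# (e.g. tensor parallel across NUMA nodes) will apply.
lscpu | grep -o 'amx[^ ]*' | sort -u
```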
ChatQnA/docker_compose/intel/cpu/xeon/compose.perf.yaml

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  vllm-service:
    image: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.10.0
    environment:
      VLLM_CPU_SGL_KERNEL: 1
    entrypoint: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
    command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --dtype bfloat16 --distributed-executor-backend mp --block-size 128 --enforce-eager --tensor-parallel-size $TP_NUM --pipeline-parallel-size $PP_NUM --max-num-batched-tokens $MAX_BATCHED_TOKENS --max-num-seqs $MAX_SEQS
  vllm-ci-test:
    image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f-cpu
    container_name: vllm-ci-test
    volumes:
      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
    shm_size: 128g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HF_TOKEN}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_CPU_KVCACHE_SPACE: 40
      ON_CPU: 1
      REMOTE_HOST: vllm-service
      REMOTE_PORT: 80
    entrypoint: tail -f /dev/null
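The override keeps vLLM's OpenAI-compatible server listening on port 80 inside the vllm-service container; the host-side port comes from the base compose.yaml, which is not shown here. A minimal smoke test, assuming the base file publishes that container port as 9009 on the host (adjust to your actual mapping):

```bash
# Smoke test of the OpenAI-compatible completions endpoint served by vllm-service.
# Host port 9009 is an assumption; replace it with whatever port the base
# compose.yaml maps to the container's port 80.
curl -s http://localhost:9009/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL_ID}"'", "prompt": "Say hello from Xeon.", "max_tokens": 16}'
```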

ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh

Lines changed: 16 additions & 0 deletions
@@ -37,3 +37,19 @@ export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
 pushd "grafana/dashboards" > /dev/null
 source download_opea_dashboard.sh
 popd > /dev/null
+declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
+echo $numa_count
+if (( numa_count % 2 == 0 )); then
+  if (( numa_count == 6 )); then
+    export TP_NUM=2
+    export PP_NUM=3
+  else
+    export TP_NUM=$numa_count
+    export PP_NUM=1
+  fi
+else
+  export PP_NUM=$numa_count
+  export TP_NUM=1
+fi
+export MAX_BATCHED_TOKENS=2048
+export MAX_SEQS=256
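For reference, the NUMA-based selection above resolves the tensor-parallel (TP_NUM) and pipeline-parallel (PP_NUM) sizes as follows; the loop below is only an illustrative dry run of the same branch logic, not part of set_env.sh:

```bash
# Dry run of the TP/PP selection logic for a few NUMA node counts.
for numa_count in 1 2 4 6 8; do
  if (( numa_count % 2 == 0 )); then
    if (( numa_count == 6 )); then tp=2; pp=3; else tp=$numa_count; pp=1; fi
  else
    pp=$numa_count; tp=1
  fi
  echo "NUMA nodes: $numa_count -> TP_NUM=$tp PP_NUM=$pp"
done
# Expected output:
#   NUMA nodes: 1 -> TP_NUM=1 PP_NUM=1
#   NUMA nodes: 2 -> TP_NUM=2 PP_NUM=1
#   NUMA nodes: 4 -> TP_NUM=4 PP_NUM=1
#   NUMA nodes: 6 -> TP_NUM=2 PP_NUM=3
#   NUMA nodes: 8 -> TP_NUM=8 PP_NUM=1
```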
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

vllm:
  image:
    repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
    tag: "v0.10.0"
  resources: {}
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  # Uncomment the following model specific settings for DeepSeek models
  VLLM_CPU_KVCACHE_SPACE: 40
  VLLM_CPU_SGL_KERNEL: 1

  extraCmdArgs: [
    "--tensor-parallel-size", "1",
    "--pipeline-parallel-size", "1",
    "--block-size", "128",
    "--dtype", "bfloat16",
    "--max-model-len", "5196",
    "--distributed_executor_backend", "mp",
    "--max-num-batched-tokens", "2048",
    "--max-num-seqs", "256",
    "--enforce-eager"]
  #resources:
  #  requests:
  #    memory: 60Gi # 40G for KV cache, and 20G for DeepSeek-R1-Distill-Qwen-7B, need to adjust it for other models
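These values are meant to be layered onto the ChatQnA Helm chart so the vLLM pod picks up the CPU-optimized image and arguments. The chart location and the name of this values file are not shown in the diff, so the command below is only a sketch with assumed paths:

```bash
# Sketch only: "./chatqna" (local chart directory) and "cpu-vllm-values.yaml"
# (this values file) are assumed names; substitute the real chart reference
# and values-file path used in your deployment.
helm upgrade --install chatqna ./chatqna \
  -f cpu-vllm-values.yaml \
  --namespace chatqna --create-namespace
```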
