4 files changed in docker_compose/intel/cpu/xeon: +76 -0 lines.
CPU example with Open Telemetry feature:

```bash
docker compose -f compose.yaml -f compose.telemetry.yaml up -d
```

To enable Xeon optimizations such as AMX or tensor parallelism for vLLM, merge the compose.perf.yaml file with the default compose.yaml file.

CPU example with the optimized vLLM feature:

```bash
docker compose -f compose.yaml -f compose.perf.yaml up -d
```

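compose.perf.yaml reads TP_NUM, PP_NUM, MAX_BATCHED_TOKENS, and MAX_SEQS from the environment, so export them before the merged `docker compose up`. The environment script in this change derives them from the NUMA topology; a manual sketch with illustrative values only:

```bash
# Illustrative values -- the environment script in this change computes TP_NUM/PP_NUM from `lscpu`
export TP_NUM=2
export PP_NUM=1
export MAX_BATCHED_TOKENS=2048
export MAX_SEQS=256
docker compose -f compose.yaml -f compose.perf.yaml up -d
```
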
**Note**: developers should build docker image from source when:

- Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
The new compose.perf.yaml overrides the vLLM service with the performance-tuned settings and adds a CI test container:

```yaml
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  vllm-service:
    image: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.10.0
    environment:
      VLLM_CPU_SGL_KERNEL: 1
    entrypoint: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
    command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --dtype bfloat16 --distributed-executor-backend mp --block-size 128 --enforce-eager --tensor-parallel-size $TP_NUM --pipeline-parallel-size $PP_NUM --max-num-batched-tokens $MAX_BATCHED_TOKENS --max-num-seqs $MAX_SEQS
  vllm-ci-test:
    image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f-cpu
    container_name: vllm-ci-test
    volumes:
      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
    shm_size: 128g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HF_TOKEN}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_CPU_KVCACHE_SPACE: 40
      ON_CPU: 1
      REMOTE_HOST: vllm-service
      REMOTE_PORT: 80
    entrypoint: tail -f /dev/null
```
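Once the merged stack is running, the tuned vllm-service can be sanity-checked through vLLM's OpenAI-compatible API. A minimal sketch, assuming the base compose.yaml publishes the container's port 80 at the address stored in LLM_ENDPOINT (exported by the environment script further below):

```bash
# List the models served by vLLM (the OpenAI-compatible /v1/models route)
curl -s "${LLM_ENDPOINT}/v1/models"

# Minimal completion request to confirm end-to-end inference
curl -s "${LLM_ENDPOINT}/v1/completions" \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL_ID}"'", "prompt": "Hello", "max_tokens": 8}'
```
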
The environment setup script is extended to derive the tensor-parallel (TP_NUM) and pipeline-parallel (PP_NUM) sizes from the host's NUMA node count:

```bash
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
pushd "grafana/dashboards" > /dev/null
source download_opea_dashboard.sh
popd > /dev/null
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
echo $numa_count
if (( numa_count % 2 == 0 )); then
    if (( numa_count == 6 )); then
        export TP_NUM=2
        export PP_NUM=3
    else
        export TP_NUM=$numa_count
        export PP_NUM=1
    fi
else
    export PP_NUM=$numa_count
    export TP_NUM=1
fi
export MAX_BATCHED_TOKENS=2048
export MAX_SEQS=256
```
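For reference, the branches above map NUMA counts to parallelism settings as follows (an illustration derived from the logic, not additional script code):

```bash
# numa_count = 1 -> TP_NUM=1, PP_NUM=1   (odd branch)
# numa_count = 2 -> TP_NUM=2, PP_NUM=1
# numa_count = 3 -> TP_NUM=1, PP_NUM=3   (odd branch)
# numa_count = 4 -> TP_NUM=4, PP_NUM=1
# numa_count = 6 -> TP_NUM=2, PP_NUM=3   (special-cased so TP x PP = 6)
# numa_count = 8 -> TP_NUM=8, PP_NUM=1
```
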
A matching Helm values file configures the vLLM CPU image and its arguments:

```yaml
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

vllm:
  image:
    repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
    tag: "v0.10.0"
  resources: {}
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  # Uncomment the following model specific settings for DeepSeek models
  VLLM_CPU_KVCACHE_SPACE: 40
  VLLM_CPU_SGL_KERNEL: 1

  extraCmdArgs: [
    "--tensor-parallel-size", "1",
    "--pipeline-parallel-size", "1",
    "--block-size", "128",
    "--dtype", "bfloat16",
    "--max-model-len", "5196",
    "--distributed-executor-backend", "mp",
    "--max-num-batched-tokens", "2048",
    "--max-num-seqs", "256",
    "--enforce-eager"]
  # resources:
  #   requests:
  #     memory: 60Gi # 40G for KV cache, and 20G for DeepSeek-R1-Distill-Qwen-7B, need to adjust it for other models
```
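These values are applied when installing or upgrading the chart with `-f`. A minimal sketch, where the release name, chart path, and values filename are placeholders rather than names taken from this change:

```bash
# Placeholder release/chart/values names -- substitute the actual chart this values file belongs to
helm upgrade --install vllm-cpu ./chart \
  -f cpu-values.yaml \
  --namespace llm --create-namespace
```
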