
Commit 3e90eab

Enable Xeon optimizations like Tensor Parallel and AMX from vLLM 0.10.0 (#2106)
Signed-off-by: Tsai, Louie <[email protected]>
1 parent c133b2f commit 3e90eab

File tree: 4 files changed, +76 / -0 lines changed


ChatQnA/docker_compose/intel/cpu/xeon/README.md

Lines changed: 7 additions & 0 deletions
@@ -73,6 +73,13 @@ CPU example with Open Telemetry feature:
 docker compose -f compose.yaml -f compose.telemetry.yaml up -d
 ```

+To enable Xeon optimizations such as AMX or Tensor Parallel for vLLM, the compose.perf.yaml file needs to be merged with the default compose.yaml file.
+CPU example with the optimized vLLM feature:
+
+```bash
+docker compose -f compose.yaml -f compose.perf.yaml up -d
+```
+
 **Note**: developers should build docker image from source when:

 - Developing off the git main branch (as the container's ports in the repo may be different from the published docker image).
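AMX only takes effect on CPUs that expose it (4th Gen Intel Xeon Scalable, i.e. Sapphire Rapids, or newer). As a quick sanity check before layering compose.perf.yaml on top of compose.yaml, you can confirm the host advertises the AMX CPU flags; this check is not part of the commit, just a convenience sketch:

```bash
# Print any AMX-related CPU flags (amx_tile, amx_bf16, amx_int8).
# No output means the CPU lacks AMX, and only the non-AMX optimizations
# (e.g. tensor parallel across NUMA nodes) will apply.
lscpu | grep -o 'amx[^ ]*' | sort -u
```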
ChatQnA/docker_compose/intel/cpu/xeon/compose.perf.yaml

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  vllm-service:
    image: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.10.0
    environment:
      VLLM_CPU_SGL_KERNEL: 1
    entrypoint: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
    command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --dtype bfloat16 --distributed-executor-backend mp --block-size 128 --enforce-eager --tensor-parallel-size $TP_NUM --pipeline-parallel-size $PP_NUM --max-num-batched-tokens $MAX_BATCHED_TOKENS --max-num-seqs $MAX_SEQS
  vllm-ci-test:
    image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f-cpu
    container_name: vllm-ci-test
    volumes:
      - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
    shm_size: 128g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      HF_TOKEN: ${HF_TOKEN}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      VLLM_CPU_KVCACHE_SPACE: 40
      ON_CPU: 1
      REMOTE_HOST: vllm-service
      REMOTE_PORT: 80
    entrypoint: tail -f /dev/null
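The override keeps vLLM's OpenAI-compatible server listening on port 80 inside the vllm-service container; the host-side port comes from the base compose.yaml, which is not shown here. A minimal smoke test, assuming the base file publishes that container port as 9009 on the host (adjust to your actual mapping):

```bash
# Smoke test of the OpenAI-compatible completions endpoint served by vllm-service.
# Host port 9009 is an assumption; replace it with whatever port the base
# compose.yaml maps to the container's port 80.
curl -s http://localhost:9009/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${LLM_MODEL_ID}"'", "prompt": "Say hello from Xeon.", "max_tokens": 16}'
```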

ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh

Lines changed: 16 additions & 0 deletions
@@ -37,3 +37,19 @@ export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
 pushd "grafana/dashboards" > /dev/null
 source download_opea_dashboard.sh
 popd > /dev/null
+declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
+echo $numa_count
+if (( numa_count % 2 == 0 )); then
+  if (( numa_count == 6 )); then
+    export TP_NUM=2
+    export PP_NUM=3
+  else
+    export TP_NUM=$numa_count
+    export PP_NUM=1
+  fi
+else
+  export PP_NUM=$numa_count
+  export TP_NUM=1
+fi
+export MAX_BATCHED_TOKENS=2048
+export MAX_SEQS=256
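For reference, the NUMA-based selection above resolves the tensor-parallel (TP_NUM) and pipeline-parallel (PP_NUM) sizes as follows; the loop below is only an illustrative dry run of the same branch logic, not part of set_env.sh:

```bash
# Dry run of the TP/PP selection logic for a few NUMA node counts.
for numa_count in 1 2 4 6 8; do
  if (( numa_count % 2 == 0 )); then
    if (( numa_count == 6 )); then tp=2; pp=3; else tp=$numa_count; pp=1; fi
  else
    pp=$numa_count; tp=1
  fi
  echo "NUMA nodes: $numa_count -> TP_NUM=$tp PP_NUM=$pp"
done
# Expected output:
#   NUMA nodes: 1 -> TP_NUM=1 PP_NUM=1
#   NUMA nodes: 2 -> TP_NUM=2 PP_NUM=1
#   NUMA nodes: 4 -> TP_NUM=4 PP_NUM=1
#   NUMA nodes: 6 -> TP_NUM=2 PP_NUM=3
#   NUMA nodes: 8 -> TP_NUM=8 PP_NUM=1
```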
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

vllm:
  image:
    repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
    tag: "v0.10.0"
  resources: {}
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  # Uncomment the following model specific settings for DeepSeek models
  VLLM_CPU_KVCACHE_SPACE: 40
  VLLM_CPU_SGL_KERNEL: 1

  extraCmdArgs: [
    "--tensor-parallel-size", "1",
    "--pipeline-parallel-size", "1",
    "--block-size", "128",
    "--dtype", "bfloat16",
    "--max-model-len", "5196",
    "--distributed_executor_backend", "mp",
    "--max-num-batched-tokens", "2048",
    "--max-num-seqs", "256",
    "--enforce-eager"]
  #resources:
  #  requests:
  #    memory: 60Gi # 40G for KV cache, and 20G for DeepSeek-R1-Distill-Qwen-7B, need to adjust it for other models
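These values are meant to be layered onto the ChatQnA Helm chart so the vLLM pod picks up the CPU-optimized image and arguments. The chart location and the name of this values file are not shown in the diff, so the command below is only a sketch with assumed paths:

```bash
# Sketch only: "./chatqna" (local chart directory) and "cpu-vllm-values.yaml"
# (this values file) are assumed names; substitute the real chart reference
# and values-file path used in your deployment.
helm upgrade --install chatqna ./chatqna \
  -f cpu-vllm-values.yaml \
  --namespace chatqna --create-namespace
```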
