
Commit e9b1645 (parent: 6263b51)

align vllm hpu version to latest vllm-fork (#1061)

Signed-off-by: Xinyao Wang <[email protected]>

4 files changed: +82, -82 lines


ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml

2 additions, 2 deletions:

@@ -86,7 +86,7 @@ services:
       MAX_WARMUP_SEQUENCE_LENGTH: 512
     command: --model-id ${RERANK_MODEL_ID} --auto-truncate
   vllm-service:
-    image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest}
+    image: ${REGISTRY:-opea}/vllm-hpu:${TAG:-latest}
     container_name: vllm-gaudi-server
     ports:
       - "8007:80"
@@ -104,7 +104,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
+    command: --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server
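Two things change in this service: the image now comes from the upstream HabanaAI build (vllm-hpu) instead of the GenAIComps wrapper (llm-vllm-hpu), and the compose command shrinks to bare flags, presumably because the new image already bakes in the OpenAI-compatible API server as its entrypoint. A minimal smoke test of the renamed service, sketched under two assumptions: the "8007:80" host mapping above is kept, and LLM_MODEL_ID is exported in the calling shell:

    # Query vLLM's OpenAI-compatible completions endpoint through the
    # host port mapped in compose_vllm.yaml (8007 -> container port 80).
    curl http://localhost:8007/v1/completions \
      -H "Content-Type: application/json" \
      -d '{
            "model": "'"${LLM_MODEL_ID}"'",
            "prompt": "What is deep learning?",
            "max_tokens": 32
          }'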

ChatQnA/docker_image_build/build.yaml

6 additions, 6 deletions:

@@ -77,12 +77,6 @@ services:
       dockerfile: comps/llms/text-generation/vllm/langchain/Dockerfile
     extends: chatqna
     image: ${REGISTRY:-opea}/llm-vllm:${TAG:-latest}
-  llm-vllm-hpu:
-    build:
-      context: GenAIComps
-      dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
-    extends: chatqna
-    image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest}
   llm-vllm-ray-hpu:
     build:
       context: GenAIComps
@@ -113,6 +107,12 @@ services:
       dockerfile: Dockerfile.cpu
     extends: chatqna
     image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
+  vllm-hpu:
+    build:
+      context: vllm-fork
+      dockerfile: Dockerfile.hpu
+    extends: chatqna
+    image: ${REGISTRY:-opea}/vllm-hpu:${TAG:-latest}
   nginx:
     build:
       context: GenAIComps
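The replacement stanza builds directly from a local checkout of the HabanaAI fork rather than from GenAIComps. For orientation, a rough manual equivalent of the compose build, assuming vllm-fork is cloned next to build.yaml and REGISTRY/TAG are left at their defaults (opea / latest):

    # What "docker compose -f build.yaml build vllm-hpu" roughly amounts to:
    # build context is the vllm-fork checkout, using its in-tree Dockerfile.hpu.
    git clone https://github.com/HabanaAI/vllm-fork.git
    docker build -f vllm-fork/Dockerfile.hpu -t opea/vllm-hpu:latest vllm-fork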

ChatQnA/tests/test_compose_vllm_on_gaudi.sh

2 additions, 1 deletion:

@@ -17,9 +17,10 @@ ip_address=$(hostname -I | awk '{print $1}')
 function build_docker_images() {
     cd $WORKPATH/docker_image_build
     git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
+    git clone https://github.com/HabanaAI/vllm-fork.git

     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="chatqna chatqna-ui dataprep-redis retriever-redis llm-vllm-hpu nginx"
+    service_list="chatqna chatqna-ui dataprep-redis retriever-redis vllm-hpu nginx"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log

     docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
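The new clone deliberately has no checkout step, so each test run builds whatever vllm-fork's default branch holds at that moment, which matches the "align to latest vllm-fork" intent of this commit. If a reproducible build were wanted instead, one could pin a ref in the same style as the GenAIComps clone above (the ref below is a placeholder, not a real tag):

    # Hypothetical pinned variant; substitute a real vllm-fork tag or commit.
    git clone https://github.com/HabanaAI/vllm-fork.git && \
        cd vllm-fork && git checkout <known-good-ref> && cd ../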
