
Commit 7fdf05f

Support remote endpoints (#2085)
Signed-off-by: Ubuntu <azureuser@denvr-inf.kifxisxbiwme5gt4kkwqsfdjuh.dx.internal.cloudapp.net>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 80fa841 commit 7fdf05f

13 files changed: +545 additions, -54 deletions


ChatQnA/chatqna.py

Lines changed: 17 additions & 19 deletions
@@ -175,25 +175,23 @@ def align_generator(self, gen, **kwargs):
         # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
         for line in gen:
             line = line.decode("utf-8")
-            start = line.find("{")
-            end = line.rfind("}") + 1
-
-            json_str = line[start:end]
-            try:
-                # sometimes yield empty chunk, do a fallback here
-                json_data = json.loads(json_str)
-                if "ops" in json_data and "op" in json_data["ops"][0]:
-                    if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str):
-                        yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n"
-                    else:
-                        pass
-                elif (
-                    json_data["choices"][0]["finish_reason"] != "eos_token"
-                    and "content" in json_data["choices"][0]["delta"]
-                ):
-                    yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
-            except Exception as e:
-                yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
+            chunks = [chunk.strip() for chunk in line.split("\n\n") if chunk.strip()]
+            for line in chunks:
+                start = line.find("{")
+                end = line.rfind("}") + 1
+                json_str = line[start:end]
+                try:
+                    # sometimes yield empty chunk, do a fallback here
+                    json_data = json.loads(json_str)
+                    if "ops" in json_data and "op" in json_data["ops"][0]:
+                        if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str):
+                            yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n"
+                        else:
+                            pass
+                    elif "content" in json_data["choices"][0]["delta"]:
+                        yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
+                except Exception as e:
+                    yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
         yield "data: [DONE]\n\n"

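A likely reason for this change: a single read from a remote endpoint's event stream can carry several `data: ...` server-sent events back to back, so parsing the whole read as one JSON object hits the exception fallback and echoes raw text. Below is a minimal, self-contained sketch of the new splitting behavior; the example payload is hypothetical.

```python
import json

# Hypothetical raw chunk: two SSE events delivered in a single read,
# separated by the usual blank line ("\n\n").
raw = (
    b'data: {"choices":[{"delta":{"content":"Hello"},"finish_reason":null}]}\n\n'
    b'data: {"choices":[{"delta":{"content":" world"},"finish_reason":null}]}\n\n'
)

line = raw.decode("utf-8")
# Split the read into individual events, then extract the JSON object from each.
chunks = [chunk.strip() for chunk in line.split("\n\n") if chunk.strip()]
for event in chunks:
    start, end = event.find("{"), event.rfind("}") + 1
    payload = json.loads(event[start:end])
    print(payload["choices"][0]["delta"]["content"])  # prints "Hello", then " world"
```
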
ChatQnA/docker_compose/intel/cpu/xeon/README.md

Lines changed: 23 additions & 0 deletions
@@ -147,6 +147,7 @@ In the context of deploying a ChatQnA pipeline on an Intel® Xeon® platform, we
 | File                                             | Description                                                                                  |
 | ------------------------------------------------ | -------------------------------------------------------------------------------------------- |
 | [compose.yaml](./compose.yaml)                   | Default compose file using vllm as serving framework and redis as vector database           |
+| [compose_remote.yaml](./compose_remote.yaml)     | Default compose file using remote inference endpoints and redis as vector database          |
 | [compose_milvus.yaml](./compose_milvus.yaml)     | Uses Milvus as the vector database. All other configurations remain the same as the default |
 | [compose_pinecone.yaml](./compose_pinecone.yaml) | Uses Pinecone as the vector database. All other configurations remain the same as the default. For more details, refer to [README_pinecone.md](./README_pinecone.md). |
 | [compose_qdrant.yaml](./compose_qdrant.yaml)     | Uses Qdrant as the vector database. All other configurations remain the same as the default. For more details, refer to [README_qdrant.md](./README_qdrant.md). |
@@ -158,6 +159,28 @@ In the context of deploying a ChatQnA pipeline on an Intel® Xeon® platform, we
 | [compose_tgi.telemetry.yaml](./compose_tgi.telemetry.yaml) | Helper file for telemetry features for tgi. Can be used along with any compose files that serves tgi |
 | [compose_mariadb.yaml](./compose_mariadb.yaml)             | Uses MariaDB Server as the vector database. All other configurations remain the same as the default |

+### Running LLM models with remote endpoints
+
+When models are deployed on a remote server, a base URL and an API key are required to access them. To set up a remote server and acquire the base URL and API key, refer to [Intel® AI for Enterprise Inference](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/enterprise-inference.html) offerings.
+
+Set the following environment variables.
+
+- `REMOTE_ENDPOINT` is the HTTPS endpoint of the remote server hosting the model of choice (e.g. https://api.example.com). **Note:** If the API for the models does not use LiteLLM, append the second part of the model card to the URL. For example, set `REMOTE_ENDPOINT` to https://api.example.com/Llama-3.3-70B-Instruct if the model card is `meta-llama/Llama-3.3-70B-Instruct`.
+- `API_KEY` is the access token or key used to access the model(s) on the server.
+- `LLM_MODEL_ID` is the model card, which may need to be overwritten depending on what it is set to in `set_env.sh`.
+
+```bash
+export REMOTE_ENDPOINT=<https-endpoint-of-remote-server>
+export API_KEY=<your-api-key>
+export LLM_MODEL_ID=<model-card>
+```
+
+After setting these environment variables, run `docker compose` with `compose_remote.yaml`:
+
+```bash
+docker compose -f compose_remote.yaml up -d
+```
+
 ## ChatQnA with Conversational UI (Optional)

 To access the Conversational UI (react based) frontend, modify the UI service in the `compose` file used to deploy. Replace `chaqna-xeon-ui-server` service with the `chatqna-xeon-conversation-ui-server` service as per the config below:
ChatQnA/docker_compose/intel/cpu/xeon/compose_remote.yaml

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ services:
       - RERANK_SERVER_HOST_IP=tei-reranking-service
       - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
       - LLM_SERVER_HOST_IP=${REMOTE_ENDPOINT}
-      - OPENAI_API_KEY= ${OPENAI_API_KEY}
+      - OPENAI_API_KEY=${API_KEY}
       - LLM_SERVER_PORT=80
       - LLM_MODEL=${LLM_MODEL_ID}
       - LOGFLAG=${LOGFLAG}

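Worth noting why this change matters: with list-style `environment:` entries, everything after the first `=` is taken literally, so the old `OPENAI_API_KEY= ${OPENAI_API_KEY}` passed a value with a leading space; the substitution also now reads the documented `API_KEY` variable. A quick way to check the value inside a running container (the container name is assumed; adjust to your deployment):

```bash
# Show the variable with non-printing characters made visible;
# a stray leading space would appear before the key.
docker exec chatqna-xeon-backend-server printenv OPENAI_API_KEY | cat -A
```
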
CodeGen/codegen.py

Lines changed: 0 additions & 5 deletions
@@ -181,7 +181,6 @@ async def handle_request(self, request: Request):
 
         # Handle the chat messages to generate the prompt
         prompt = handle_message(chat_request.messages)
-
         # Get the agents flag from the request data, default to False if not provided
         agents_flag = data.get("agents_flag", False)
 
@@ -200,7 +199,6 @@ async def handle_request(self, request: Request):
 
         # Initialize the initial inputs with the generated prompt
         initial_inputs = {"query": prompt}
-
         # Check if the key index name is provided in the parameters
         if parameters.index_name:
             if agents_flag:
@@ -268,7 +266,6 @@ async def handle_request(self, request: Request):
         result_dict, runtime_graph = await megaservice.schedule(
             initial_inputs=initial_inputs, llm_parameters=parameters
         )
-
         for node, response in result_dict.items():
             # Check if the last microservice in the megaservice is LLM
             if (
@@ -277,7 +274,6 @@ async def handle_request(self, request: Request):
                 and megaservice.services[node].service_type == ServiceType.LLM
             ):
                 return response
-
         # Get the response from the last node in the runtime graph
         last_node = runtime_graph.all_leaves()[-1]
 
@@ -288,7 +284,6 @@ async def handle_request(self, request: Request):
             response = result_dict[last_node]["text"]
         except (KeyError, TypeError):
             response = "Response Error"
-
         choices = []
         usage = UsageInfo()
         choices.append(

CodeGen/docker_compose/intel/cpu/xeon/README.md

Lines changed: 28 additions & 0 deletions
@@ -91,11 +91,39 @@ Different Docker Compose files are available to select the LLM serving backend.
 - **Description:** Uses Hugging Face Text Generation Inference (TGI) optimized for Intel CPUs as the LLM serving engine.
 - **Services Deployed:** `codegen-tgi-server`, `codegen-llm-server`, `codegen-tei-embedding-server`, `codegen-retriever-server`, `redis-vector-db`, `codegen-dataprep-server`, `codegen-backend-server`, `codegen-gradio-ui-server`.
 - **To Run:**
+
   ```bash
   # Ensure environment variables (HOST_IP, HF_TOKEN) are set
   docker compose -f compose_tgi.yaml up -d
   ```

+#### Deployment with remote endpoints (`compose_remote.yaml`)
+
+- **Compose File:** `compose_remote.yaml`
+- **Description:** Uses remote endpoints to access the served LLMs. This is the default configuration except for the LLM serving engine.
+- **Services Deployed:** `codegen-tei-embedding-server`, `codegen-retriever-server`, `redis-vector-db`, `codegen-dataprep-server`, `codegen-backend-server`, `codegen-gradio-ui-server`.
+- **To Run:**
+
+When models are deployed on a remote server, a base URL and an API key are required to access them. To set up a remote server and acquire the base URL and API key, refer to [Intel® AI for Enterprise Inference](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/enterprise-inference.html) offerings.
+
+Set the following environment variables.
+
+- `REMOTE_ENDPOINT` is the HTTPS endpoint of the remote server hosting the model of choice (e.g. https://api.example.com). **Note:** If the API for the models does not use LiteLLM, append the second part of the model card to the URL. For example, set `REMOTE_ENDPOINT` to https://api.example.com/Llama-3.3-70B-Instruct if the model card is `meta-llama/Llama-3.3-70B-Instruct`.
+- `API_KEY` is the access token or key used to access the model(s) on the server.
+- `LLM_MODEL_ID` is the model card, which may need to be overwritten depending on what it is set to in `set_env.sh`.
+
+```bash
+export REMOTE_ENDPOINT=<https-endpoint-of-remote-server>
+export API_KEY=<your-api-key>
+export LLM_MODEL_ID=<model-card>
+```
+
+After setting these environment variables, run `docker compose` with `compose_remote.yaml`:
+
+```bash
+docker compose -f compose_remote.yaml up -d
+```
+
 ### Configuration Parameters
 
 #### Environment Variables

CodeGen/docker_compose/intel/cpu/xeon/compose_remote.yaml

Lines changed: 10 additions & 1 deletion
@@ -6,6 +6,9 @@ services:
   codegen-xeon-backend-server:
     image: ${REGISTRY:-opea}/codegen:${TAG:-latest}
     container_name: codegen-xeon-backend-server
+    depends_on:
+      dataprep-redis-server:
+        condition: service_healthy
     ports:
       - "7778:7778"
     environment:
@@ -14,7 +17,8 @@ services:
       - http_proxy=${http_proxy}
       - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
      - LLM_SERVICE_HOST_IP=${REMOTE_ENDPOINT}
-      - OPENAI_API_KEY= ${OPENAI_API_KEY}
+      - LLM_MODEL_ID=${LLM_MODEL_ID}
+      - OPENAI_API_KEY=${API_KEY}
       - RETRIEVAL_SERVICE_HOST_IP=${RETRIEVAL_SERVICE_HOST_IP}
       - REDIS_RETRIEVER_PORT=${REDIS_RETRIEVER_PORT}
       - TEI_EMBEDDING_HOST_IP=${TEI_EMBEDDING_HOST_IP}
@@ -61,6 +65,11 @@ services:
       INDEX_NAME: ${INDEX_NAME}
       HF_TOKEN: ${HF_TOKEN}
       LOGFLAG: true
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
     restart: unless-stopped
   tei-embedding-serving:
     image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.7

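The new `depends_on` condition ties the backend's startup to the dataprep healthcheck added further down. A quick way to confirm the ordering after `docker compose -f compose_remote.yaml up -d` (the container name is assumed to match the service name and may differ in your deployment):

```bash
# Per-service state for this compose project, including health.
docker compose -f compose_remote.yaml ps

# Health status reported by the dataprep container's healthcheck.
docker inspect --format '{{.State.Health.Status}}' dataprep-redis-server
```
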
DocSum/docker_compose/intel/cpu/xeon/README.md

Lines changed: 27 additions & 4 deletions
@@ -115,10 +115,33 @@ All the DocSum containers will be stopped and then removed on completion of the
 
 In the context of deploying a DocSum pipeline on an Intel® Xeon® platform, we can pick and choose different large language model serving frameworks. The table below outlines the various configurations that are available as part of the application.
 
-| File                                   | Description                                                                                |
-| -------------------------------------- | ------------------------------------------------------------------------------------------ |
-| [compose.yaml](./compose.yaml)         | Default compose file using vllm as serving framework                                       |
-| [compose_tgi.yaml](./compose_tgi.yaml) | The LLM serving framework is TGI. All other configurations remain the same as the default |
+| File                                         | Description                                                                                        |
+| -------------------------------------------- | -------------------------------------------------------------------------------------------------- |
+| [compose.yaml](./compose.yaml)               | Default compose file using vllm as serving framework                                               |
+| [compose_tgi.yaml](./compose_tgi.yaml)       | The LLM serving framework is TGI. All other configurations remain the same as the default          |
+| [compose_remote.yaml](./compose_remote.yaml) | Uses remote inference endpoints for LLMs. All other configurations remain the same as the default  |
+
+### Running LLM models with remote endpoints
+
+When models are deployed on a remote server, a base URL and an API key are required to access them. To set up a remote server and acquire the base URL and API key, refer to [Intel® AI for Enterprise Inference](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/enterprise-inference.html) offerings.
+
+Set the following environment variables.
+
+- `REMOTE_ENDPOINT` is the HTTPS endpoint of the remote server hosting the model of choice (e.g. https://api.example.com). **Note:** If the API for the models does not use LiteLLM, append the second part of the model card to the URL. For example, set `REMOTE_ENDPOINT` to https://api.example.com/Llama-3.3-70B-Instruct if the model card is `meta-llama/Llama-3.3-70B-Instruct`.
+- `API_KEY` is the access token or key used to access the model(s) on the server.
+- `LLM_MODEL_ID` is the model card, which may need to be overwritten depending on what it is set to in `set_env.sh`.
+
+```bash
+export REMOTE_ENDPOINT=<https-endpoint-of-remote-server>
+export API_KEY=<your-api-key>
+export LLM_MODEL_ID=<model-card>
+```
+
+After setting these environment variables, run `docker compose` with `compose_remote.yaml`:
+
+```bash
+docker compose -f compose_remote.yaml up -d
+```
 
 ## DocSum Detailed Usage
 

DocSum/docker_compose/intel/cpu/xeon/compose_remote.yaml

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  llm-docsum-vllm:
+    image: ${REGISTRY:-opea}/llm-docsum:${TAG:-latest}
+    container_name: docsum-xeon-llm-server
+    ports:
+      - ${LLM_PORT:-9000}:9000
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      LLM_ENDPOINT: ${REMOTE_ENDPOINT}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      OPENAI_API_KEY: ${API_KEY}
+      HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
+      HF_TOKEN: ${HF_TOKEN}
+      MAX_INPUT_TOKENS: ${MAX_INPUT_TOKENS}
+      MAX_TOTAL_TOKENS: ${MAX_TOTAL_TOKENS}
+      DocSum_COMPONENT_NAME: ${DocSum_COMPONENT_NAME}
+
+      LOGFLAG: ${LOGFLAG:-False}
+    restart: unless-stopped
+
+  whisper:
+    image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+    container_name: docsum-xeon-whisper-server
+    ports:
+      - "7066:7066"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+    restart: unless-stopped
+
+  docsum-xeon-backend-server:
+    image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+    container_name: docsum-xeon-backend-server
+    depends_on:
+      - llm-docsum-vllm
+    ports:
+      - "${BACKEND_SERVICE_PORT:-8888}:8888"
+    environment:
+      - no_proxy=${no_proxy}
+      - https_proxy=${https_proxy}
+      - http_proxy=${http_proxy}
+      - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+      - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
+      - ASR_SERVICE_HOST_IP=${ASR_SERVICE_HOST_IP}
+    ipc: host
+    restart: always
+
+  docsum-gradio-ui:
+    image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
+    container_name: docsum-xeon-ui-server
+    depends_on:
+      - docsum-xeon-backend-server
+    ports:
+      - "${FRONTEND_SERVICE_PORT:-5173}:5173"
+    environment:
+      - no_proxy=${no_proxy}
+      - https_proxy=${https_proxy}
+      - http_proxy=${http_proxy}
+      - BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
+      - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
+    ipc: host
+    restart: always
+
+networks:
+  default:
+    driver: bridge

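Everything in this new compose file is driven by environment variables. Beyond the `REMOTE_ENDPOINT`, `API_KEY`, and `LLM_MODEL_ID` covered in the README above, it also reads the usual DocSum settings (ports, host IPs, `MAX_*_TOKENS`) from `set_env.sh`. A minimal launch sketch with placeholder values; the `DocSum_COMPONENT_NAME` value mirrors the export shown elsewhere in this commit:

```bash
source set_env.sh                              # host IPs, ports, MAX_*_TOKENS, etc.
export REMOTE_ENDPOINT=<https-endpoint-of-remote-server>
export API_KEY=<your-api-key>
export LLM_MODEL_ID=<model-card>
export DocSum_COMPONENT_NAME="OpeaDocSumvLLM"  # as exported in the ProductivitySuite README below

docker compose -f compose_remote.yaml up -d
```
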
ProductivitySuite/docker_compose/intel/cpu/xeon/README.md

Lines changed: 25 additions & 0 deletions
@@ -43,6 +43,7 @@ Some HuggingFace resources, such as some models, are only accessible if you have
 To set up environment variables for deploying Productivity Suite service, source the set_env.sh script in this directory:
 
 ```
+export host_ip=<ip-address-of-the-machine>
 source set_env.sh
 ```
 
@@ -228,3 +229,27 @@ The table provides a comprehensive overview of the Productivity Suite service ut
 | tgi_service_codegen | ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu | No | Serves code generation models for inference, optimized for Intel Xeon CPUs. |
 | tgi-service         | ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu | No | Specific to the TGI deployment, focuses on text generation inference using Xeon hardware. |
 | whisper-server      | opea/whisper:latest                                           | No | Provides speech-to-text transcription services using Whisper models. |
+
+### Running LLM models with remote endpoints
+
+When models are deployed on a remote server, a base URL and an API key are required to access them. To set up a remote server and acquire the base URL and API key, refer to [Intel® AI for Enterprise Inference](https://www.intel.com/content/www/us/en/developer/topic-technology/artificial-intelligence/enterprise-inference.html) offerings.
+
+Set the following environment variables.
+
+- `REMOTE_ENDPOINT` is the HTTPS endpoint of the remote server hosting the model of choice (e.g. https://api.example.com). **Note:** If the API for the models does not use LiteLLM, append the second part of the model card to the URL. For example, set `REMOTE_ENDPOINT` to https://api.example.com/Llama-3.3-70B-Instruct if the model card is `meta-llama/Llama-3.3-70B-Instruct`.
+- `API_KEY` is the access token or key used to access the model(s) on the server.
+- `LLM_MODEL_ID` is the model card, which may need to be overwritten depending on what it is set to in `set_env.sh`.
+
+```bash
+export DocSum_COMPONENT_NAME="OpeaDocSumvLLM"
+export REMOTE_ENDPOINT=<https-endpoint-of-remote-server>
+export API_KEY=<your-api-key>
+export LLM_MODEL_ID=<model-card>
+export LLM_MODEL_ID_CODEGEN=<model-card>
+```
+
+After setting these environment variables, run `docker compose` with `compose_remote.yaml`:
+
+```bash
+docker compose -f compose_remote.yaml up -d
+```
