
Commit 525ea86

[NeuralChat] Configure TGI endpoint from YAML (intel#1321)
* update tgi endpoint

Signed-off-by: LetongHan <[email protected]>
1 parent ffa8f3c commit 525ea86

File tree

5 files changed: +83 −45 lines changed

intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md

Lines changed: 7 additions & 0 deletions
@@ -63,6 +63,8 @@ You can customize the configuration file 'tgi.yaml' to match your environment se
 | model_name_or_path | "./neural-chat-7b-v3-1" |
 | device | "cpu"/"gpu"/"hpu" |
 | serving.framework | "tgi" |
+| serving.framework.tgi_engine_params.endpoint | The endpoint of an existing TGI service. When endpoint is set, NeuralChat will not start a TGI service, and the other params take no effect. |
+| serving.framework.tgi_engine_params.port | 9876, the port on which NeuralChat starts the TGI service. |
 | serving.framework.tgi_engine_params.sharded | true (false only on cpu) |
 | serving.framework.tgi_engine_params.num_shard | 4 (not effective when sharded is false) |
 | serving.framework.tgi_engine_params.habana_visible_devices | "0,1" (only on hpu) |
@@ -90,3 +92,8 @@ curl ${your_ip}:${your_port}/v1/tgi/generate \
 ```
 
 Of course, you can also consume the service via `postman`, `http request`, or other ways.
+
+If NeuralChat is unable to call your local TGI service, run the command below, then try again:
+```bash
+unset http_proxy
+```
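Reviewer's note on the README change: a minimal Python client for the `/v1/tgi/generate` route could look like the sketch below. The payload shape (`inputs` plus `parameters`) is an assumption borrowed from TGI's standard generate API, since the curl body is truncated in this hunk; adjust the host, port, and fields to your deployment.

```python
# Hedged client sketch for /v1/tgi/generate; the payload fields are assumptions
# based on TGI's generate API, not confirmed by this commit.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/tgi/generate",  # stands in for ${your_ip}:${your_port}
    json={
        "inputs": "Tell me about Intel Xeon Scalable processors.",
        "parameters": {"max_new_tokens": 128},
    },
    timeout=60,
)
print(resp.status_code, resp.text)
```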

intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml

Lines changed: 4 additions & 0 deletions
@@ -29,6 +29,10 @@ device: "auto"
 serving:
   framework: "tgi"
   tgi_engine_params:
+    # when endpoint is set, neuralchat will not start a tgi service,
+    # and other params will not work
+    endpoint: "http://0.0.0.0:9876/"
+    port: "9876"
     # not supported on CPU
     sharded: true
     num_shard: 4
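To make the precedence between `endpoint` and the other params concrete, here is an illustrative sketch (assuming PyYAML) of how a config like the one above is consumed; it mirrors the server logic in `neuralchat_server.py` below.

```python
# Illustrative only: load tgi.yaml and apply the endpoint-over-port precedence
# that the server code below implements.
import yaml

with open("tgi.yaml") as f:
    config = yaml.safe_load(f)

tgi_params = config["serving"]["tgi_engine_params"]
endpoint = tgi_params.get("endpoint")   # if set, reuse the running TGI service
port = tgi_params.get("port", "9876")   # otherwise start TGI on this port

print(endpoint if endpoint else f"will start a TGI service on port {port}")
```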

intel_extension_for_transformers/neural_chat/server/neuralchat_server.py

Lines changed: 56 additions & 43 deletions
@@ -132,53 +132,59 @@ def init(self, config):
         # TGI serving
         elif serving_framework == "tgi":
             tgi_params = serving.get("tgi_engine_params", None)
-            tgi_sharded = tgi_params.get('sharded', False)
-            tgi_num_shard = tgi_params.get('num_shard', 1)
-            tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
-            # construct tgi command
-            tgi_cmd = "docker run -p 9876:80 --name tgi_service -v ./data:/data"
-            if device == "cpu":
-                tgi_cmd += " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
-                # sharded is not supported on CPU
-                if tgi_sharded:
-                    tgi_sharded = False
-            elif device == "gpu":
-                tgi_cmd += " --gpus all --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
-                pass
-            elif device == "hpu":
-                create_docker_cmd = f"git clone https://github.com/huggingface/tgi-gaudi.git && \
-                    cd tgi-gaudi && docker build -t tgi_gaudi ."
+            tgi_endpoint = tgi_params.get('endpoint', None)
+            if tgi_endpoint:
+                logger.info(f"tgi endpoint already exists: {tgi_endpoint}")
+            # start a tgi service
+            else:
+                tgi_port = tgi_params.get('port', "9876")
+                tgi_sharded = tgi_params.get('sharded', False)
+                tgi_num_shard = tgi_params.get('num_shard', 1)
+                tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
+                # construct tgi command
+                tgi_cmd = f"docker run -p {tgi_port}:80 --name tgi_service -v ./data:/data"
+                if device == "cpu":
+                    tgi_cmd += " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
+                    # sharded is not supported on CPU
+                    if tgi_sharded:
+                        tgi_sharded = False
+                elif device == "gpu":
+                    tgi_cmd += " --gpus all --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
+                    pass
+                elif device == "hpu":
+                    create_docker_cmd = f"git clone https://github.com/huggingface/tgi-gaudi.git && \
+                        cd tgi-gaudi && docker build -t tgi_gaudi ."
+                    try:
+                        # create docker image first
+                        logger.info(f"<neuralchat_server> create docker command = {create_docker_cmd}")
+                        sys.stdout.flush()
+                        sys.stderr.flush()
+                        subprocess.Popen(create_docker_cmd, shell=True, executable="/bin/bash") # nosec
+                        logger.info("creating tgi habana docker image...")
+                        time.sleep(200)
+                    except Exception as e:
+                        raise RuntimeError(f"Error in tgi habana docker image creation: {e}")
+                    # add tgi_cmd
+                    if tgi_sharded and tgi_num_shard > 1:
+                        tgi_cmd += " -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
+                    tgi_cmd += f" --runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices} \
+                        -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host tgi_gaudi"
+                else:
+                    logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
+                    raise Exception("Please specify device for tgi.")
+                tgi_cmd += f" --model-id {model_name_or_path}"
+                if tgi_sharded and tgi_num_shard > 1:
+                    tgi_cmd += f" --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
+                # start tgi service
                 try:
-                    # create docker image first
-                    logger.info(f"<neuralchat_server> create docker command = {create_docker_cmd}")
+                    logger.info(f"<neuralchat_server> Run docker. cmd: {tgi_cmd}")
                     sys.stdout.flush()
                     sys.stderr.flush()
-                    subprocess.Popen(create_docker_cmd, shell=True, executable="/bin/bash") # nosec
-                    logger.info("creating tgi habana docker image...")
+                    subprocess.Popen(tgi_cmd, shell=True, executable="/bin/bash") # nosec
+                    logger.info("Building docker container...")
                     time.sleep(200)
                 except Exception as e:
-                    raise RuntimeError(f"Error in tgi habana docker image creation: {e}")
-                # add tgi_cmd
-                if tgi_sharded and tgi_num_shard > 1:
-                    tgi_cmd += "-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
-                tgi_cmd += f"--runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices} \
-                    -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host tgi_gaudi"
-            else:
-                logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
-                raise Exception("Please specify device for tgi.")
-            tgi_cmd += f" --model-id {model_name_or_path}"
-            if tgi_sharded and tgi_num_shard > 1:
-                tgi_cmd += " --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
-            # start tgi service
-            try:
-                logger.info(f"<neuralchat_server> Run docker. cmd: {tgi_cmd}")
-                sys.stdout.flush()
-                sys.stderr.flush()
-                subprocess.Popen(tgi_cmd, shell=True, executable="/bin/bash") # nosec
-                logger.info("Building docker container...")
-                time.sleep(200)
-            except Exception as e:
-                raise RuntimeError(f"Error when building docker container: {e}")
+                    raise RuntimeError(f"Error when building docker container: {e}")
 
         # plugin as service
         if plugin_as_service:
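For reference, here is the command the CPU branch above assembles when `port` is "9876" and `model_name_or_path` is "./neural-chat-7b-v3-1" (sharding is forced off on CPU), written out as the resolved string:

```python
# The fully resolved tgi_cmd for the CPU branch, spelled out for clarity.
tgi_cmd = (
    "docker run -p 9876:80 --name tgi_service -v ./data:/data"
    " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
    " --model-id ./neural-chat-7b-v3-1"
)
```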
@@ -317,7 +323,14 @@ def init(self, config):
         self.chatbot = build_chatbot(pipeline_config)
         # init api
         from .restful.api import setup_router
-        api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port)
+        if serving and serving.get("framework") == "tgi":
+            if tgi_endpoint:
+                endpoint = tgi_endpoint
+            else:
+                endpoint = f"http://0.0.0.0:{tgi_port}/"
+            api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port, endpoint)
+        else:
+            api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port)
         app.include_router(api_router)
         return True
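The hunk above boils down to one resolution rule: a user-configured endpoint wins, otherwise the router is pointed at the service the server just launched. A minimal sketch, reusing the names from the diff:

```python
# Sketch of the endpoint resolution feeding setup_router (names from the diff).
def resolve_endpoint(tgi_endpoint, tgi_port="9876"):
    # A configured endpoint takes precedence over the locally started service.
    return tgi_endpoint if tgi_endpoint else f"http://0.0.0.0:{tgi_port}/"
```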

intel_extension_for_transformers/neural_chat/server/restful/api.py

Lines changed: 13 additions & 1 deletion
@@ -50,7 +50,16 @@
     'tgi': tgi_router
 }
 
-def setup_router(api_list, chatbot=None, enable_llm=True, use_deepspeed=False, world_size=1, host="0.0.0.0", port=80):
+def setup_router(
+    api_list,
+    chatbot=None,
+    enable_llm=True,
+    use_deepspeed=False,
+    world_size=1,
+    host="0.0.0.0",
+    port=80,
+    endpoint=None
+):
     """Setup router for FastAPI
 
     Args:
@@ -69,6 +78,9 @@ def setup_router(api_list, chatbot=None, enable_llm=True, use_deepspeed=False, w
         if lower_api_name == "plugin_image2image":
             api_router.worker.start()
             logger.info("create main worker done...")
+        if endpoint and lower_api_name == "tgi":
+            api_router.set_tgi_endpoint(endpoint)
+            logger.info(f"set tgi endpoint: {endpoint}")
         _router.include_router(api_router)
     else:
         logger.error(f"NeuralChat has not supported such service yet: {api_name}")

intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py

Lines changed: 3 additions & 1 deletion
@@ -28,9 +28,11 @@ class TextGenerationAPIRouter(APIRouter):
 
     def __init__(self) -> None:
         super().__init__()
-        self.endpoint = "http://0.0.0.0:9876/"
         self.chatbot = None
 
+    def set_tgi_endpoint(self, endpoint):
+        self.endpoint = endpoint
+
     def set_chatbot(self, chatbot, use_deepspeed, world_size, host, port) -> None:
         self.chatbot = chatbot
         self.use_deepspeed = use_deepspeed
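With the hardcoded default removed from `__init__`, the endpoint must now be injected before the router can forward requests; `setup_router` does this whenever an endpoint is resolved. A minimal sketch of the new flow:

```python
# Minimal sketch: the endpoint is injected rather than fixed at construction time.
from intel_extension_for_transformers.neural_chat.server.restful.tgi_api import TextGenerationAPIRouter

router = TextGenerationAPIRouter()
router.set_tgi_endpoint("http://0.0.0.0:9876/")  # e.g. the old default value
assert router.endpoint == "http://0.0.0.0:9876/"
```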
