
Commit 525ea86

[NeuralChat] Configure TGI endpoint from YAML (intel#1321)
* update tgi endpoint

Signed-off-by: LetongHan <[email protected]>
1 parent ffa8f3c commit 525ea86

File tree

5 files changed: +83 −45 lines changed

intel_extension_for_transformers/neural_chat/examples/serving/TGI/README.md

Lines changed: 7 additions & 0 deletions
@@ -63,6 +63,8 @@ You can customize the configuration file 'tgi.yaml' to match your environment se
 | model_name_or_path | "./neural-chat-7b-v3-1" |
 | device | "cpu"/"gpu"/"hpu" |
 | serving.framework | "tgi" |
+| serving.framework.tgi_engine_params.endpoint | The endpoint of an existing TGI service. When endpoint is set, NeuralChat will not start a TGI service, and the other params take no effect. |
+| serving.framework.tgi_engine_params.port | 9876, the port on which NeuralChat starts the TGI service. |
 | serving.framework.tgi_engine_params.sharded | true (false only on cpu) |
 | serving.framework.tgi_engine_params.num_shard | 4 (not effective when sharded is false) |
 | serving.framework.tgi_engine_params.habana_visible_devices | "0,1" (only on hpu) |
@@ -90,3 +92,8 @@ curl ${your_ip}:${your_port}/v1/tgi/generate \
 ```
 
 Of course, you can also consume the service via `postman`, `http request`, or other ways.
+
+If NeuralChat is unable to call your local TGI service, run the command below, then try again:
+```bash
+unset http_proxy
+```
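Reviewer's note on the README change: a minimal Python client for the `/v1/tgi/generate` route could look like the sketch below. The payload shape (`inputs` plus `parameters`) is an assumption borrowed from TGI's standard generate API, since the curl body is truncated in this hunk; adjust the host, port, and fields to your deployment.

```python
# Hedged client sketch for /v1/tgi/generate; the payload fields are assumptions
# based on TGI's generate API, not confirmed by this commit.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/tgi/generate",  # stands in for ${your_ip}:${your_port}
    json={
        "inputs": "Tell me about Intel Xeon Scalable processors.",
        "parameters": {"max_new_tokens": 128},
    },
    timeout=60,
)
print(resp.status_code, resp.text)
```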

intel_extension_for_transformers/neural_chat/examples/serving/TGI/tgi.yaml

Lines changed: 4 additions & 0 deletions
@@ -29,6 +29,10 @@ device: "auto"
 serving:
   framework: "tgi"
   tgi_engine_params:
+    # when endpoint is set, neuralchat will not start a tgi service,
+    # and other params will not work
+    endpoint: "http://0.0.0.0:9876/"
+    port: "9876"
     # not supported on CPU
     sharded: true
     num_shard: 4
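To make the precedence between `endpoint` and the other params concrete, here is an illustrative sketch (assuming PyYAML) of how a config like the one above is consumed; it mirrors the server logic in `neuralchat_server.py` below.

```python
# Illustrative only: load tgi.yaml and apply the endpoint-over-port precedence
# that the server code below implements.
import yaml

with open("tgi.yaml") as f:
    config = yaml.safe_load(f)

tgi_params = config["serving"]["tgi_engine_params"]
endpoint = tgi_params.get("endpoint")   # if set, reuse the running TGI service
port = tgi_params.get("port", "9876")   # otherwise start TGI on this port

print(endpoint if endpoint else f"will start a TGI service on port {port}")
```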

intel_extension_for_transformers/neural_chat/server/neuralchat_server.py

Lines changed: 56 additions & 43 deletions
@@ -132,53 +132,59 @@ def init(self, config):
         # TGI serving
         elif serving_framework == "tgi":
             tgi_params = serving.get("tgi_engine_params", None)
-            tgi_sharded = tgi_params.get('sharded', False)
-            tgi_num_shard = tgi_params.get('num_shard', 1)
-            tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
-            # construct tgi command
-            tgi_cmd = "docker run -p 9876:80 --name tgi_service -v ./data:/data"
-            if device == "cpu":
-                tgi_cmd += " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
-                # sharded is not supported on CPU
-                if tgi_sharded:
-                    tgi_sharded = False
-            elif device == "gpu":
-                tgi_cmd += " --gpus all --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
-                pass
-            elif device == "hpu":
-                create_docker_cmd = f"git clone https://github.com/huggingface/tgi-gaudi.git && \
-                    cd tgi-gaudi && docker build -t tgi_gaudi ."
+            tgi_endpoint = tgi_params.get('endpoint', None)
+            if tgi_endpoint:
+                logger.info(f"tgi endpoint already exists: {tgi_endpoint}")
+            # start a tgi service
+            else:
+                tgi_port = tgi_params.get('port', "9876")
+                tgi_sharded = tgi_params.get('sharded', False)
+                tgi_num_shard = tgi_params.get('num_shard', 1)
+                tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
+                # construct tgi command
+                tgi_cmd = f"docker run -p {tgi_port}:80 --name tgi_service -v ./data:/data"
+                if device == "cpu":
+                    tgi_cmd += " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
+                    # sharded is not supported on CPU
+                    if tgi_sharded:
+                        tgi_sharded = False
+                elif device == "gpu":
+                    tgi_cmd += " --gpus all --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
+                    pass
+                elif device == "hpu":
+                    create_docker_cmd = f"git clone https://github.com/huggingface/tgi-gaudi.git && \
+                        cd tgi-gaudi && docker build -t tgi_gaudi ."
+                    try:
+                        # create docker image first
+                        logger.info(f"<neuralchat_server> create docker command = {create_docker_cmd}")
+                        sys.stdout.flush()
+                        sys.stderr.flush()
+                        subprocess.Popen(create_docker_cmd, shell=True, executable="/bin/bash") # nosec
+                        logger.info("creating tgi habana docker image...")
+                        time.sleep(200)
+                    except Exception as e:
+                        raise RuntimeError(f"Error in tgi habana docker image creation: {e}")
+                    # add tgi_cmd
+                    if tgi_sharded and tgi_num_shard > 1:
+                        tgi_cmd += " -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
+                    tgi_cmd += f" --runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices} \
+                        -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host tgi_gaudi"
+                else:
+                    logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
+                    raise Exception("Please specify device for tgi.")
+                tgi_cmd += f" --model-id {model_name_or_path}"
+                if tgi_sharded and tgi_num_shard > 1:
+                    tgi_cmd += f" --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
+                # start tgi service
                 try:
-                    # create docker image first
-                    logger.info(f"<neuralchat_server> create docker command = {create_docker_cmd}")
+                    logger.info(f"<neuralchat_server> Run docker. cmd: {tgi_cmd}")
                     sys.stdout.flush()
                     sys.stderr.flush()
-                    subprocess.Popen(create_docker_cmd, shell=True, executable="/bin/bash") # nosec
-                    logger.info("creating tgi habana docker image...")
+                    subprocess.Popen(tgi_cmd, shell=True, executable="/bin/bash") # nosec
+                    logger.info("Building docker container...")
                     time.sleep(200)
                 except Exception as e:
-                    raise RuntimeError(f"Error in tgi habana docker image creation: {e}")
-                # add tgi_cmd
-                if tgi_sharded and tgi_num_shard > 1:
-                    tgi_cmd += "-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
-                tgi_cmd += f"--runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices} \
-                    -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host tgi_gaudi"
-            else:
-                logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
-                raise Exception("Please specify device for tgi.")
-            tgi_cmd += f" --model-id {model_name_or_path}"
-            if tgi_sharded and tgi_num_shard > 1:
-                tgi_cmd += " --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
-            # start tgi service
-            try:
-                logger.info(f"<neuralchat_server> Run docker. cmd: {tgi_cmd}")
-                sys.stdout.flush()
-                sys.stderr.flush()
-                subprocess.Popen(tgi_cmd, shell=True, executable="/bin/bash") # nosec
-                logger.info("Building docker container...")
-                time.sleep(200)
-            except Exception as e:
-                raise RuntimeError(f"Error when building docker container: {e}")
+                    raise RuntimeError(f"Error when building docker container: {e}")
 
         # plugin as service
         if plugin_as_service:
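For reference, here is the command the CPU branch above assembles when `port` is "9876" and `model_name_or_path` is "./neural-chat-7b-v3-1" (sharding is forced off on CPU), written out as the resolved string:

```python
# The fully resolved tgi_cmd for the CPU branch, spelled out for clarity.
tgi_cmd = (
    "docker run -p 9876:80 --name tgi_service -v ./data:/data"
    " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
    " --model-id ./neural-chat-7b-v3-1"
)
```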
@@ -317,7 +323,14 @@ def init(self, config):
         self.chatbot = build_chatbot(pipeline_config)
         # init api
         from .restful.api import setup_router
-        api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port)
+        if serving and serving.get("framework") == "tgi":
+            if tgi_endpoint:
+                endpoint = tgi_endpoint
+            else:
+                endpoint = f"http://0.0.0.0:{tgi_port}/"
+            api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port, endpoint)
+        else:
+            api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port)
         app.include_router(api_router)
         return True
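The hunk above boils down to one resolution rule: a user-configured endpoint wins, otherwise the router is pointed at the service the server just launched. A minimal sketch, reusing the names from the diff:

```python
# Sketch of the endpoint resolution feeding setup_router (names from the diff).
def resolve_endpoint(tgi_endpoint, tgi_port="9876"):
    # A configured endpoint takes precedence over the locally started service.
    return tgi_endpoint if tgi_endpoint else f"http://0.0.0.0:{tgi_port}/"
```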

intel_extension_for_transformers/neural_chat/server/restful/api.py

Lines changed: 13 additions & 1 deletion
@@ -50,7 +50,16 @@
     'tgi': tgi_router
 }
 
-def setup_router(api_list, chatbot=None, enable_llm=True, use_deepspeed=False, world_size=1, host="0.0.0.0", port=80):
+def setup_router(
+    api_list,
+    chatbot=None,
+    enable_llm=True,
+    use_deepspeed=False,
+    world_size=1,
+    host="0.0.0.0",
+    port=80,
+    endpoint=None
+):
     """Setup router for FastAPI
 
     Args:
@@ -69,6 +78,9 @@ def setup_router(api_list, chatbot=None, enable_llm=True, use_deepspeed=False, w
         if lower_api_name == "plugin_image2image":
             api_router.worker.start()
             logger.info("create main worker done...")
+        if endpoint and lower_api_name == "tgi":
+            api_router.set_tgi_endpoint(endpoint)
+            logger.info(f"set tgi endpoint: {endpoint}")
         _router.include_router(api_router)
     else:
         logger.error(f"NeuralChat has not supported such service yet: {api_name}")

intel_extension_for_transformers/neural_chat/server/restful/tgi_api.py

Lines changed: 3 additions & 1 deletion
@@ -28,9 +28,11 @@ class TextGenerationAPIRouter(APIRouter):
 
     def __init__(self) -> None:
         super().__init__()
-        self.endpoint = "http://0.0.0.0:9876/"
         self.chatbot = None
 
+    def set_tgi_endpoint(self, endpoint):
+        self.endpoint = endpoint
+
     def set_chatbot(self, chatbot, use_deepspeed, world_size, host, port) -> None:
         self.chatbot = chatbot
         self.use_deepspeed = use_deepspeed
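With the hardcoded default removed from `__init__`, the endpoint must now be injected before the router can forward requests; `setup_router` does this whenever an endpoint is resolved. A minimal sketch of the new flow:

```python
# Minimal sketch: the endpoint is injected rather than fixed at construction time.
from intel_extension_for_transformers.neural_chat.server.restful.tgi_api import TextGenerationAPIRouter

router = TextGenerationAPIRouter()
router.set_tgi_endpoint("http://0.0.0.0:9876/")  # e.g. the old default value
assert router.endpoint == "http://0.0.0.0:9876/"
```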
