@@ -132,53 +132,59 @@ def init(self, config):
         # TGI serving
         elif serving_framework == "tgi":
             tgi_params = serving.get("tgi_engine_params", None)
-            tgi_sharded = tgi_params.get('sharded', False)
-            tgi_num_shard = tgi_params.get('num_shard', 1)
-            tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
-            # construct tgi command
-            tgi_cmd = "docker run -p 9876:80 --name tgi_service -v ./data:/data"
-            if device == "cpu":
-                tgi_cmd += " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
-                # sharded is not supported on CPU
-                if tgi_sharded:
-                    tgi_sharded = False
-            elif device == "gpu":
-                tgi_cmd += " --gpus all --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
-                pass
-            elif device == "hpu":
-                create_docker_cmd = f"git clone https://github.com/huggingface/tgi-gaudi.git && \
-                    cd tgi-gaudi && docker build -t tgi_gaudi ."
+            tgi_endpoint = tgi_params.get('endpoint', None)
+            if tgi_endpoint:
+                logger.info(f"tgi endpoint already exists: {tgi_endpoint}")
+            # start a tgi service
+            else:
+                tgi_port = tgi_params.get('port', "9876")
+                tgi_sharded = tgi_params.get('sharded', False)
+                tgi_num_shard = tgi_params.get('num_shard', 1)
+                tgi_habana_visible_devices = tgi_params.get('habana_visible_devices', "all")
+                # construct tgi command
+                tgi_cmd = f"docker run -p {tgi_port}:80 --name tgi_service -v ./data:/data"
+                if device == "cpu":
+                    tgi_cmd += " --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
+                    # sharded is not supported on CPU
+                    if tgi_sharded:
+                        tgi_sharded = False
+                elif device == "gpu":
+                    tgi_cmd += " --gpus all --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.3"
+                    pass
+                elif device == "hpu":
+                    create_docker_cmd = f"git clone https://github.com/huggingface/tgi-gaudi.git && \
+                        cd tgi-gaudi && docker build -t tgi_gaudi ."
+                    try:
+                        # create docker image first
+                        logger.info(f"<neuralchat_server> create docker command = {create_docker_cmd}")
+                        sys.stdout.flush()
+                        sys.stderr.flush()
+                        subprocess.Popen(create_docker_cmd, shell=True, executable="/bin/bash")  # nosec
+                        logger.info("creating tgi habana docker image...")
+                        time.sleep(200)
+                    except Exception as e:
+                        raise RuntimeError(f"Error in tgi habana docker image creation: {e}")
+                    # add tgi_cmd
+                    if tgi_sharded and tgi_num_shard > 1:
+                        tgi_cmd += " -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
+                    tgi_cmd += f" --runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices} \
+                        -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host tgi_gaudi"
+                else:
+                    logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
+                    raise Exception("Please specify device for tgi.")
+                tgi_cmd += f" --model-id {model_name_or_path}"
+                if tgi_sharded and tgi_num_shard > 1:
+                    tgi_cmd += f" --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
+                # start tgi service
             try:
-                    # create docker image first
-                    logger.info(f"<neuralchat_server> create docker command = {create_docker_cmd}")
+                logger.info(f"<neuralchat_server> Run docker. cmd: {tgi_cmd}")
                 sys.stdout.flush()
                 sys.stderr.flush()
-                    subprocess.Popen(create_docker_cmd, shell=True, executable="/bin/bash")  # nosec
-                    logger.info("creating tgi habana docker image ...")
+                subprocess.Popen(tgi_cmd, shell=True, executable="/bin/bash")  # nosec
+                logger.info("Building docker container ...")
                 time.sleep(200)
             except Exception as e:
-                    raise RuntimeError(f"Error in tgi habana docker image creation: {e}")
-                # add tgi_cmd
-                if tgi_sharded and tgi_num_shard > 1:
-                    tgi_cmd += "-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
-                tgi_cmd += f"--runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices} \
-                    -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host tgi_gaudi"
-            else:
-                logger.error(f"Supported device: [cpu, gpu, hpu]. Your device: {device}")
-                raise Exception("Please specify device for tgi.")
-            tgi_cmd += f" --model-id {model_name_or_path}"
-            if tgi_sharded and tgi_num_shard > 1:
-                tgi_cmd += " --sharded {tgi_sharded} --num-shard {tgi_num_shard}"
-            # start tgi service
-            try:
-                logger.info(f"<neuralchat_server> Run docker. cmd: {tgi_cmd}")
-                sys.stdout.flush()
-                sys.stderr.flush()
-                subprocess.Popen(tgi_cmd, shell=True, executable="/bin/bash")  # nosec
-                logger.info("Building docker container...")
-                time.sleep(200)
-            except Exception as e:
-                raise RuntimeError(f"Error when building docker container: {e}")
+                raise RuntimeError(f"Error when building docker container: {e}")

         # plugin as service
         if plugin_as_service:
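
For reference, here is a minimal standalone sketch (not part of the commit) that traces what the command assembly in the hunk above produces on an HPU host with sharding enabled. The dict keys mirror the `tgi_engine_params` keys read via `.get()`; the parameter values and the model id are illustrative assumptions, and the `tgi_gaudi` image is assumed to have been built already:

```python
# Sketch only: mirrors the hunk's tgi_cmd assembly for device == "hpu".
tgi_params = {"port": "9876", "sharded": True, "num_shard": 2,
              "habana_visible_devices": "all"}  # illustrative values
device = "hpu"
model_name_or_path = "Intel/neural-chat-7b-v3-1"  # example model id

tgi_port = tgi_params.get("port", "9876")
tgi_sharded = tgi_params.get("sharded", False)
tgi_num_shard = tgi_params.get("num_shard", 1)
tgi_habana_visible_devices = tgi_params.get("habana_visible_devices", "all")

tgi_cmd = f"docker run -p {tgi_port}:80 --name tgi_service -v ./data:/data"
if device == "hpu":
    if tgi_sharded and tgi_num_shard > 1:
        tgi_cmd += " -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true"
    tgi_cmd += (f" --runtime=habana -e HABANA_VISIBLE_DEVICES={tgi_habana_visible_devices}"
                " -e OMPI_MCA_btl_vader_single_copy_mechanism=none"
                " --cap-add=sys_nice --ipc=host tgi_gaudi")
tgi_cmd += f" --model-id {model_name_or_path}"
if tgi_sharded and tgi_num_shard > 1:
    tgi_cmd += f" --sharded {tgi_sharded} --num-shard {tgi_num_shard}"

print(tgi_cmd)
# (wrapped for readability)
# docker run -p 9876:80 --name tgi_service -v ./data:/data
#   -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true --runtime=habana
#   -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none
#   --cap-add=sys_nice --ipc=host tgi_gaudi
#   --model-id Intel/neural-chat-7b-v3-1 --sharded True --num-shard 2
```

One caveat worth noting: interpolating the Python boolean yields `--sharded True`, and the TGI launcher may expect a lowercase `true`; `str(tgi_sharded).lower()` would be the safer interpolation.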
@@ -317,7 +323,14 @@ def init(self, config):
         self.chatbot = build_chatbot(pipeline_config)
         # init api
         from .restful.api import setup_router
-        api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port)
+        if serving and serving.get("framework") == "tgi":
+            if tgi_endpoint:
+                endpoint = tgi_endpoint
+            else:
+                endpoint = f"http://0.0.0.0:{tgi_port}/"
+            api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port, endpoint)
+        else:
+            api_router = setup_router(api_list, self.chatbot, True, use_deepspeed, world_size, host, port)
         app.include_router(api_router)
         return True

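
To make the wiring concrete, here is a hypothetical sketch (not part of the commit) of the `serving` section this code path expects, written as a Python dict for illustration; in practice it is presumably loaded from the server's YAML config. The key names come from the `.get()` calls in the hunks above, while every value shown is an assumption:

```python
# Hypothetical config shape; key names taken from the .get() calls above.
serving = {
    "framework": "tgi",
    "tgi_engine_params": {
        # "endpoint": "http://192.168.1.10:9876/",  # set this to reuse a running TGI server
        "port": "9876",                # used only when launching TGI locally
        "sharded": False,
        "num_shard": 1,
        "habana_visible_devices": "all",
    },
}

# Endpoint resolution, mirroring the second hunk:
tgi_params = serving.get("tgi_engine_params", None)
tgi_endpoint = tgi_params.get("endpoint", None)
tgi_port = tgi_params.get("port", "9876")
endpoint = tgi_endpoint if tgi_endpoint else f"http://0.0.0.0:{tgi_port}/"
print(endpoint)  # -> http://0.0.0.0:9876/
```

If `endpoint` is present, the server skips the docker launch entirely and passes the existing address straight through to `setup_router`; otherwise it points the router at the locally launched container.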