Skip to content

Commit c8b5348

Browse files
authored
Merge branch 'main' into vrdn-23/fix-gelu-activation
2 parents d365667 + 02f60f0 commit c8b5348

File tree

7 files changed

+55
-13
lines changed

7 files changed

+55
-13
lines changed

README.md

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -137,14 +137,15 @@ To see all options to serve your models:
137137
$ text-embeddings-router --help
138138
Text Embedding Webserver
139139

140-
Usage: text-embeddings-router [OPTIONS]
140+
Usage: text-embeddings-router [OPTIONS] --model-id <MODEL_ID>
141141

142142
Options:
143143
--model-id <MODEL_ID>
144-
The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `BAAI/bge-large-en-v1.5`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
144+
The Hugging Face model ID; it can be any model listed on <https://huggingface.co/models> with the `text-embeddings-inference` tag (meaning it's compatible with Text Embeddings Inference).
145+
146+
Alternatively, the specified ID can also be a path to a local directory containing the necessary model files saved by the `save_pretrained(...)` methods of either Transformers or Sentence Transformers.
145147

146148
[env: MODEL_ID=]
147-
[default: BAAI/bge-large-en-v1.5]
148149

149150
--revision <REVISION>
150151
The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
@@ -162,6 +163,11 @@ Options:
162163
[env: DTYPE=]
163164
[possible values: float16, float32]
164165

166+
--served-model-name <SERVED_MODEL_NAME>
167+
The name of the model that is being served. If not specified, defaults to `--model-id`. It is only used for the OpenAI-compatible endpoints via HTTP
168+
169+
[env: SERVED_MODEL_NAME=]
170+
165171
--pooling <POOLING>
166172
Optionally control the pooling method for embedding models.
167173

@@ -238,10 +244,9 @@ Options:
238244

239245
Some embedding models require an extra `Dense` module which contains a single Linear layer and an activation function. By default, those `Dense` modules are stored under the `2_Dense` directory, but there might be cases where different `Dense` modules are provided, to convert the pooled embeddings into different dimensions, available as `2_Dense_<dims>` e.g. https://huggingface.co/NovaSearch/stella_en_400M_v5.
240246

241-
Note that this argument is optional, only required to be set if the path to the `Dense` module is other than `2_Dense`. And it also applies when leveraging the `candle` backend.
247+
Note that this argument is optional: it only needs to be set if there is no `modules.json` file, or if you want to override a single Dense module path. It only applies when running with the `candle` backend.
242248

243249
[env: DENSE_PATH=]
244-
[default: 2_Dense]
245250

246251
--hf-token <HF_TOKEN>
247252
Your Hugging Face Hub token

docs/openapi.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,6 +1215,7 @@
12151215
"required": [
12161216
"model_id",
12171217
"model_dtype",
1218+
"served_model_name",
12181219
"model_type",
12191220
"max_concurrent_requests",
12201221
"max_input_length",
@@ -1278,6 +1279,10 @@
12781279
"model_type": {
12791280
"$ref": "#/components/schemas/ModelType"
12801281
},
1282+
"served_model_name": {
1283+
"type": "string",
1284+
"example": "thenlper/gte-base"
1285+
},
12811286
"sha": {
12821287
"type": "string",
12831288
"example": "null",

docs/source/en/cli_arguments.md

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,15 @@ To see all options to serve your models, run the following:
2222
$ text-embeddings-router --help
2323
Text Embedding Webserver
2424

25-
Usage: text-embeddings-router [OPTIONS]
25+
Usage: text-embeddings-router [OPTIONS] --model-id <MODEL_ID>
2626

2727
Options:
2828
--model-id <MODEL_ID>
29-
The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `BAAI/bge-large-en-v1.5`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
29+
The Hugging Face model ID; it can be any model listed on <https://huggingface.co/models> with the `text-embeddings-inference` tag (meaning it's compatible with Text Embeddings Inference).
30+
31+
Alternatively, the specified ID can also be a path to a local directory containing the necessary model files saved by the `save_pretrained(...)` methods of either Transformers or Sentence Transformers.
3032

3133
[env: MODEL_ID=]
32-
[default: BAAI/bge-large-en-v1.5]
3334

3435
--revision <REVISION>
3536
The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
@@ -47,6 +48,11 @@ Options:
4748
[env: DTYPE=]
4849
[possible values: float16, float32]
4950

51+
--served-model-name <SERVED_MODEL_NAME>
52+
The name of the model that is being served. If not specified, defaults to `--model-id`. It is only used for the OpenAI-compatible endpoints via HTTP
53+
54+
[env: SERVED_MODEL_NAME=]
55+
5056
--pooling <POOLING>
5157
Optionally control the pooling method for embedding models.
5258

@@ -123,10 +129,9 @@ Options:
123129

124130
Some embedding models require an extra `Dense` module which contains a single Linear layer and an activation function. By default, those `Dense` modules are stored under the `2_Dense` directory, but there might be cases where different `Dense` modules are provided, to convert the pooled embeddings into different dimensions, available as `2_Dense_<dims>` e.g. https://huggingface.co/NovaSearch/stella_en_400M_v5.
125131

126-
Note that this argument is optional, only required to be set if the path to the `Dense` module is other than `2_Dense`. And it also applies when leveraging the `candle` backend.
132+
Note that this argument is optional: it only needs to be set if there is no `modules.json` file, or if you want to override a single Dense module path. It only applies when running with the `candle` backend.
127133

128134
[env: DENSE_PATH=]
129-
[default: 2_Dense]
130135

131136
--hf-token <HF_TOKEN>
132137
Your Hugging Face Hub token

router/src/http/server.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,18 @@ async fn openai_embed(
11531153
span.set_parent(context);
11541154
}
11551155

1156+
// NOTE: Validation of `model` won't fail for the time being given that Text Embeddings
1157+
// Inference can only serve a single model at a time so no need for the `model` parameter to
1158+
// differentiate one model from the other, but we at least raise a warning.
1159+
if let Some(requested_model) = &req.model {
1160+
if requested_model != &info.served_model_name {
1161+
tracing::warn!(
1162+
"The provided `model={}` has not been found, the `model` parameter should be provided either empty or with `model={}` instead.",
1163+
requested_model, info.served_model_name
1164+
);
1165+
}
1166+
}
1167+
11561168
let start_time = Instant::now();
11571169

11581170
let truncate = info.auto_truncate;
@@ -1308,7 +1320,7 @@ async fn openai_embed(
13081320
let response = OpenAICompatResponse {
13091321
object: "list",
13101322
data: embeddings,
1311-
model: info.model_id.clone(),
1323+
model: info.served_model_name.clone(),
13121324
usage: OpenAICompatUsage {
13131325
prompt_tokens: compute_tokens,
13141326
total_tokens: compute_tokens,

router/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ pub async fn run(
4646
revision: Option<String>,
4747
tokenization_workers: Option<usize>,
4848
dtype: Option<DType>,
49+
served_model_name: String,
4950
pooling: Option<text_embeddings_backend::Pool>,
5051
max_concurrent_requests: usize,
5152
max_batch_tokens: usize,
@@ -323,6 +324,7 @@ pub async fn run(
323324
model_id,
324325
model_sha: revision,
325326
model_dtype: dtype.to_string(),
327+
served_model_name,
326328
model_type,
327329
max_concurrent_requests,
328330
max_input_length,
@@ -539,6 +541,8 @@ pub struct Info {
539541
pub model_sha: Option<String>,
540542
#[cfg_attr(feature = "http", schema(example = "float16"))]
541543
pub model_dtype: String,
544+
#[cfg_attr(feature = "http", schema(example = "thenlper/gte-base"))]
545+
pub served_model_name: String,
542546
pub model_type: ModelType,
543547
/// Router Parameters
544548
#[cfg_attr(feature = "http", schema(example = "128"))]

router/src/main.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
1414
struct Args {
1515
/// The Hugging Face model ID, can be any model listed on <https://huggingface.co/models> with
1616
/// the `text-embeddings-inference` tag (meaning it's compatible with Text Embeddings
17-
/// Inference)
17+
/// Inference).
1818
///
1919
/// Alternatively, the specified ID can also be a path to a local directory containing the
2020
/// necessary model files saved by the `save_pretrained(...)` methods of either Transformers or
@@ -38,6 +38,11 @@ struct Args {
3838
#[clap(long, env, value_enum)]
3939
dtype: Option<DType>,
4040

41+
/// The name of the model that is being served. If not specified, defaults to `--model-id`. It
42+
/// is only used for the OpenAI-compatible endpoints via HTTP.
43+
#[clap(long, env)]
44+
served_model_name: Option<String>,
45+
4146
/// Optionally control the pooling method for embedding models.
4247
///
4348
/// If `pooling` is not set, the pooling configuration will be parsed from the
@@ -225,11 +230,16 @@ async fn main() -> Result<()> {
225230
}
226231
let token = args.hf_token.or(args.hf_api_token);
227232

233+
let served_model_name = args
234+
.served_model_name
235+
.unwrap_or_else(|| args.model_id.clone());
236+
228237
text_embeddings_router::run(
229238
args.model_id,
230239
args.revision,
231240
args.tokenization_workers,
232241
args.dtype,
242+
served_model_name,
233243
args.pooling,
234244
args.max_concurrent_requests,
235245
args.max_batch_tokens,

router/tests/common.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,11 @@ async fn check_health(port: u16, timeout: Duration) -> Result<()> {
4646
pub async fn start_server(model_id: String, revision: Option<String>, dtype: DType) -> Result<()> {
4747
let server_task = tokio::spawn({
4848
run(
49-
model_id,
49+
model_id.clone(),
5050
revision,
5151
Some(1),
5252
Some(dtype),
53+
model_id,
5354
None,
5455
4,
5556
1024,

0 commit comments

Comments
 (0)