
Update text-embeddings-router --help output #603


Merged

merged 1 commit into from Apr 29, 2025
58 changes: 23 additions & 35 deletions README.md
@@ -129,31 +129,26 @@ NVIDIA drivers on your machine need to be compatible with CUDA version 12.2 or higher

To see all options to serve your models:

```shell
text-embeddings-router --help
```
```console
$ text-embeddings-router --help
Text Embedding Webserver

```
Usage: text-embeddings-router [OPTIONS]

Options:
--model-id <MODEL_ID>
The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`.
Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of
transformers
The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `BAAI/bge-large-en-v1.5`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers

[env: MODEL_ID=]
[default: thenlper/gte-base]
[default: BAAI/bge-large-en-v1.5]

--revision <REVISION>
The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id
or a branch like `refs/pr/2`
The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`

[env: REVISION=]

--tokenization-workers <TOKENIZATION_WORKERS>
Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation.
Default to the number of CPU cores on the machine
Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation. Default to the number of CPU cores on the machine

[env: TOKENIZATION_WORKERS=]

@@ -175,14 +170,11 @@ Options:
Possible values:
- cls: Select the CLS token as embedding
- mean: Apply Mean pooling to the model embeddings
- splade: Apply SPLADE (Sparse Lexical and Expansion) to the model embeddings. This option is only
available if the loaded model is a `ForMaskedLM` Transformer model
- splade: Apply SPLADE (Sparse Lexical and Expansion) to the model embeddings. This option is only available if the loaded model is a `ForMaskedLM` Transformer model
- last-token: Select the last token as embedding

--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
The maximum amount of concurrent requests for this particular deployment.
Having a low limit will refuse clients requests instead of having them wait for too long and is usually good
to handle backpressure correctly
The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly

[env: MAX_CONCURRENT_REQUESTS=]
[default: 512]
@@ -194,8 +186,7 @@ Options:

For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.

Overall this number should be the largest possible until the model is compute bound. Since the actual memory
overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
Overall this number should be the largest possible until the model is compute bound. Since the actual memory overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.

[env: MAX_BATCH_TOKENS=]
[default: 16384]
@@ -223,9 +214,7 @@ Options:

Must be a key in the `sentence-transformers` configuration `prompts` dictionary.

For example if ``default_prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, then the
sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?" because
the prompt text will be prepended before any text to encode.
For example if ``default_prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?" because the prompt text will be prepended before any text to encode.

The argument '--default-prompt-name <DEFAULT_PROMPT_NAME>' cannot be used with '--default-prompt <DEFAULT_PROMPT>`

@@ -234,9 +223,7 @@ Options:
--default-prompt <DEFAULT_PROMPT>
The prompt that should be used by default for encoding. If not set, no prompt will be applied.

For example if ``default_prompt`` is "query: " then the sentence "What is the capital of France?" will be
encoded as "query: What is the capital of France?" because the prompt text will be prepended before any text
to encode.
For example if ``default_prompt`` is "query: " then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?" because the prompt text will be prepended before any text to encode.

The argument '--default-prompt <DEFAULT_PROMPT>' cannot be used with '--default-prompt-name <DEFAULT_PROMPT_NAME>`

@@ -260,15 +247,13 @@ Options:
[default: 3000]

--uds-path <UDS_PATH>
The name of the unix socket some text-embeddings-inference backends will use as they communicate internally
with gRPC
The name of the unix socket some text-embeddings-inference backends will use as they communicate internally with gRPC

[env: UDS_PATH=]
[default: /tmp/text-embeddings-inference-server]

--huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk
for instance
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance

[env: HUGGINGFACE_HUB_CACHE=]

@@ -283,8 +268,7 @@ Options:
--api-key <API_KEY>
Set an api key for request authorization.

By default the server responds to every request. With an api key set, the requests must have the Authorization
header set with the api key as Bearer token.
By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token.

[env: API_KEY=]

@@ -294,8 +278,6 @@ Options:
[env: JSON_OUTPUT=]

--disable-spans
Disables the span logging trace

[env: DISABLE_SPANS=]

--otlp-endpoint <OTLP_ENDPOINT>
@@ -309,8 +291,8 @@ Options:
[env: OTLP_SERVICE_NAME=]
[default: text-embeddings-inference.server]

--prometheus-port <PORT>
The Prometheus metrics port to listen on
--prometheus-port <PROMETHEUS_PORT>
The Prometheus port to listen on

[env: PROMETHEUS_PORT=]
[default: 9000]
@@ -319,6 +301,12 @@
Unused for gRPC servers

[env: CORS_ALLOW_ORIGIN=]

-h, --help
Print help (see a summary with '-h')

-V, --version
Print version
```
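
For orientation, here is a minimal sketch of how a few of these flags combine in practice. The model id, the key value, and the `/embed` request below are illustrative assumptions rather than part of this change; the help text above only states that with `--api-key` set, requests must carry the key as a Bearer token in the `Authorization` header.

```console
$ # Serve a model on the default port (3000) with request authorization enabled.
$ text-embeddings-router --model-id BAAI/bge-large-en-v1.5 --api-key my-secret-key

$ # Query the server; the Authorization header carries the configured api key.
$ curl 127.0.0.1:3000/embed \
    -X POST \
    -H 'Authorization: Bearer my-secret-key' \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?"}'
```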

### Docker Images
60 changes: 27 additions & 33 deletions docs/source/en/cli_arguments.md
@@ -18,31 +18,26 @@ rendered properly in your Markdown viewer.

To see all options to serve your models, run the following:

```shell
text-embeddings-router --help
```
```console
$ text-embeddings-router --help
Text Embedding Webserver

```
Usage: text-embeddings-router [OPTIONS]

Options:
--model-id <MODEL_ID>
The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`.
Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of
transformers
The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `BAAI/bge-large-en-v1.5`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers

[env: MODEL_ID=]
[default: thenlper/gte-base]
[default: BAAI/bge-large-en-v1.5]

--revision <REVISION>
The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id
or a branch like `refs/pr/2`
The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`

[env: REVISION=]

--tokenization-workers <TOKENIZATION_WORKERS>
Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation.
Default to the number of CPU cores on the machine
Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation. Default to the number of CPU cores on the machine

[env: TOKENIZATION_WORKERS=]

@@ -64,14 +59,11 @@ Options:
Possible values:
- cls: Select the CLS token as embedding
- mean: Apply Mean pooling to the model embeddings
- splade: Apply SPLADE (Sparse Lexical and Expansion) to the model embeddings. This option is only
available if the loaded model is a `ForMaskedLM` Transformer model
- splade: Apply SPLADE (Sparse Lexical and Expansion) to the model embeddings. This option is only available if the loaded model is a `ForMaskedLM` Transformer model
- last-token: Select the last token as embedding

--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
The maximum amount of concurrent requests for this particular deployment.
Having a low limit will refuse clients requests instead of having them wait for too long and is usually good
to handle backpressure correctly
The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly

[env: MAX_CONCURRENT_REQUESTS=]
[default: 512]
@@ -83,8 +75,7 @@ Options:

For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.

Overall this number should be the largest possible until the model is compute bound. Since the actual memory
overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
Overall this number should be the largest possible until the model is compute bound. Since the actual memory overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.

[env: MAX_BATCH_TOKENS=]
[default: 16384]
@@ -112,9 +103,7 @@ Options:

Must be a key in the `sentence-transformers` configuration `prompts` dictionary.

For example if ``default_prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, then the
sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?" because
the prompt text will be prepended before any text to encode.
For example if ``default_prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...}, then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?" because the prompt text will be prepended before any text to encode.

The argument '--default-prompt-name <DEFAULT_PROMPT_NAME>' cannot be used with '--default-prompt <DEFAULT_PROMPT>`

@@ -123,9 +112,7 @@ Options:
--default-prompt <DEFAULT_PROMPT>
The prompt that should be used by default for encoding. If not set, no prompt will be applied.

For example if ``default_prompt`` is "query: " then the sentence "What is the capital of France?" will be
encoded as "query: What is the capital of France?" because the prompt text will be prepended before any text
to encode.
For example if ``default_prompt`` is "query: " then the sentence "What is the capital of France?" will be encoded as "query: What is the capital of France?" because the prompt text will be prepended before any text to encode.

The argument '--default-prompt <DEFAULT_PROMPT>' cannot be used with '--default-prompt-name <DEFAULT_PROMPT_NAME>`

@@ -149,15 +136,13 @@ Options:
[default: 3000]

--uds-path <UDS_PATH>
The name of the unix socket some text-embeddings-inference backends will use as they communicate internally
with gRPC
The name of the unix socket some text-embeddings-inference backends will use as they communicate internally with gRPC

[env: UDS_PATH=]
[default: /tmp/text-embeddings-inference-server]

--huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk
for instance
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance

[env: HUGGINGFACE_HUB_CACHE=]

@@ -172,8 +157,7 @@ Options:
--api-key <API_KEY>
Set an api key for request authorization.

By default the server responds to every request. With an api key set, the requests must have the Authorization
header set with the api key as Bearer token.
By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token.

[env: API_KEY=]

@@ -183,8 +167,6 @@ Options:
[env: JSON_OUTPUT=]

--disable-spans
Disables the span logging trace

[env: DISABLE_SPANS=]

--otlp-endpoint <OTLP_ENDPOINT>
@@ -198,8 +180,20 @@ Options:
[env: OTLP_SERVICE_NAME=]
[default: text-embeddings-inference.server]

--prometheus-port <PROMETHEUS_PORT>
The Prometheus port to listen on

[env: PROMETHEUS_PORT=]
[default: 9000]

--cors-allow-origin <CORS_ALLOW_ORIGIN>
Unused for gRPC servers

[env: CORS_ALLOW_ORIGIN=]

-h, --help
Print help (see a summary with '-h')

-V, --version
Print version
```
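
To make the prompt options above concrete, here is a small sketch. The model id and the request are assumptions for illustration; the behaviour follows the help text, which says the default prompt is prepended to every text to encode and that `--default-prompt` cannot be combined with `--default-prompt-name`.

```console
$ # Prepend "query: " to every input before encoding.
$ text-embeddings-router --model-id BAAI/bge-large-en-v1.5 --default-prompt "query: "

$ # "What is the capital of France?" is embedded as "query: What is the capital of France?".
$ curl 127.0.0.1:3000/embed \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is the capital of France?"}'
```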