Add huggingface/pytorch/tei/docker/1.7.0 (#141)

alvarobartt · fgbelidji · varunmoris · web-flow · commit 3b88895c94a6 · 2025-04-30T16:38:27.000-04:00
* Add `huggingface/pytorch/tei/docker/1.6.1` * Update `releases.json` * Apply suggestions from code review Co-Authored-by: Florent Gbelidji <florent@huggingface.co> * Add `HF_HUB_USER_AGENT_ORIGIN` Introduced in huggingface/text-embeddings-inference#534 * Fix spacing in `Dockerfile` for CPU * Add `start-cuda-compat.sh` and copy in `Dockerfile` * Update `README.md` * Add `start-cuda-compat.sh` * Rename 1.6.1 to 1.7.0 (just released) * Update `Dockerfile` for CPU * Update `Dockerfile` for GPU * Run `ruff` * Add `subprocess` to update submodules * Add `libgssapi-krb5-2` to patch CVE Apparently only required for GPU builds * Fix path to `start-cuda-compat.sh` in `Dockerfile` --------- Co-authored-by: Florent Gbelidji <florent@huggingface.co> Co-authored-by: varunmoris <176621270+varunmoris@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -2,14 +2,15 @@
 
 Welcome to the LLM Hosting Container GitHub repository! 
 
-This repository contains Dockerfile and associated resources for building and
-hosting containers for large language models.
+This repository contains the Dockerfiles and associated resources for building and
+hosting containers for large language models and embedding models.
 
-* HuggingFace Text Generation Inference (TGI) container
+* Hugging Face Text Generation Inference (TGI) container
+* Hugging Face Text Embeddings Inference (TEI) container
 
 ## Security
 
-See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
+See [CONTRIBUTING](CONTRIBUTING.md) for more information.
 
 ## License
 
diff --git a/huggingface/pytorch/tei/docker/1.7.0/cpu/Dockerfile b/huggingface/pytorch/tei/docker/1.7.0/cpu/Dockerfile
@@ -0,0 +1,120 @@
+# Toolchain stage: cargo-chef image (Rust 1.85, Debian bookworm) with sccache added.
+# The planner and builder stages below both start from this stage.
+FROM lukemathwalker/cargo-chef:latest-rust-1.85-bookworm AS chef
+WORKDIR /usr/src
+
+# SCCACHE is the sccache release to download; RUSTC_WRAPPER points cargo at the
+# installed binary.
+ENV SCCACHE=0.5.4
+ENV RUSTC_WRAPPER=/usr/local/bin/sccache
+
+# Download and configure sccache: extract only the `sccache` binary from the
+# release tarball into /usr/local/bin
+RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
+    chmod +x /usr/local/bin/sccache
+
+# Planner stage: copies the workspace sources and manifests, then lets cargo-chef
+# write recipe.json, which the builder stage copies in to pre-build dependencies.
+FROM chef AS planner
+
+COPY candle-extensions candle-extensions
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN cargo chef prepare  --recipe-path recipe.json
+
+# Builder stage: installs Intel oneAPI MKL, pre-builds the dependencies described by
+# the planner's recipe.json, then copies the sources used by the http/grpc builders.
+FROM chef AS builder
+
+# Build args declared for this stage; nothing in this Dockerfile reads them directly.
+# NOTE(review): presumably consumed by the crates' build scripts - confirm.
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+# sccache specific variables
+ARG SCCACHE_GHA_ENABLED
+
+# Register Intel's oneAPI apt repository together with its signing key
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \
+    tee /etc/apt/sources.list.d/oneAPI.list
+
+# Install the pinned MKL development package and the C toolchain
+RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    intel-oneapi-mkl-devel=2024.0.0-49656 \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Compile a one-function shared library in which mkl_serv_intel_cpu_true() always
+# returns 1; the runtime image copies it and lists it in LD_PRELOAD
+RUN echo "int mkl_serv_intel_cpu_true() {return 1;}" > fakeintel.c && \
+    gcc -shared -fPIC -o libfakeintel.so fakeintel.c
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+
+# Pre-build dependencies from the recipe. The GitHub Actions cache credentials are
+# passed as BuildKit secret mounts exposed as env vars, so they are not written to a
+# layer. `sccache -s` prints the cache statistics afterwards.
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    cargo chef cook --release --features ort,candle,mkl --no-default-features --recipe-path recipe.json && sccache -s
+
+# NOTE(review): the planner stage also copies `candle-extensions`, this stage does
+# not - confirm that the directory is not needed for the CPU build.
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+# HTTP flavour: builds text-embeddings-router with the `http` feature on top of the
+# ort, candle and mkl features.
+FROM builder AS http-builder
+
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    cargo build --release --bin text-embeddings-router --features ort,candle,mkl,http --no-default-features && sccache -s
+
+# gRPC flavour: installs protoc, copies the proto definitions and builds
+# text-embeddings-router with the `grpc` feature.
+FROM builder AS grpc-builder
+
+# Download protoc 21.12 and unpack only bin/protoc and the include/ tree into
+# /usr/local, then delete the archive in the same layer
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY proto proto
+
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    cargo build --release --bin text-embeddings-router --features ort,candle,mkl,grpc --no-default-features && sccache -s
+
+# Runtime base image shared by the `grpc` and `http` targets below.
+FROM debian:bookworm-slim AS base
+
+# LD_PRELOAD points at the libfakeintel.so shim copied in at the end of this stage;
+# LD_LIBRARY_PATH covers /usr/local/lib, where the MKL shared objects are copied.
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    PORT=80 \
+    HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:cpu:inference:tei \
+    MKL_ENABLE_INSTRUCTIONS=AVX512_E4 \
+    RAYON_NUM_THREADS=8 \
+    LD_PRELOAD=/usr/local/libfakeintel.so \
+    LD_LIBRARY_PATH=/usr/local/lib
+
+# Runtime packages; the apt lists are removed in the same layer
+RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    libomp-dev \
+    ca-certificates \
+    libssl-dev \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy a lot of the Intel shared objects because of the mkl_serv_intel_cpu_true patch...
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_intel_lp64.so.2 /usr/local/lib/libmkl_intel_lp64.so.2
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_intel_thread.so.2 /usr/local/lib/libmkl_intel_thread.so.2
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_core.so.2 /usr/local/lib/libmkl_core.so.2
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_def.so.2 /usr/local/lib/libmkl_vml_def.so.2
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_def.so.2 /usr/local/lib/libmkl_def.so.2
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_avx2.so.2 /usr/local/lib/libmkl_vml_avx2.so.2
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_avx512.so.2 /usr/local/lib/libmkl_vml_avx512.so.2
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_avx2.so.2 /usr/local/lib/libmkl_avx2.so.2
+COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_avx512.so.2 /usr/local/lib/libmkl_avx512.so.2
+# The shim built in the builder stage, at the path named by LD_PRELOAD above
+COPY --from=builder /usr/src/libfakeintel.so /usr/local/libfakeintel.so
+
+# gRPC image: runtime base plus the router binary built by grpc-builder.
+FROM base AS grpc
+
+COPY --from=grpc-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
+
+# The router is the entrypoint; `--json-output` is the default argument and is
+# replaced by any arguments given at `docker run`
+ENTRYPOINT ["text-embeddings-router"]
+CMD ["--json-output"]
+
+# HTTP image: runtime base plus the router binary built by http-builder.
+# No ENTRYPOINT is set in this stage; the `sagemaker` stage built on it sets one.
+FROM base AS http
+
+COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
+
+# Amazon SageMaker compatible image
+FROM http AS sagemaker
+
+# Both the COPY destination and the ENTRYPOINT are relative paths, so they resolve
+# against the image's working directory.
+# NOTE(review): no WORKDIR is set in the base/http stages, so this is presumably `/` - confirm.
+COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]
diff --git a/huggingface/pytorch/tei/docker/1.7.0/gpu/Dockerfile b/huggingface/pytorch/tei/docker/1.7.0/gpu/Dockerfile
@@ -0,0 +1,119 @@
+# Shared build base for the GPU image: CUDA 12.2 devel image with sccache, a pinned
+# Rust toolchain and cargo-chef. The planner and builder stages start from here.
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder
+
+ENV SCCACHE=0.5.4
+ENV RUSTC_WRAPPER=/usr/local/bin/sccache
+ENV PATH="/root/.cargo/bin:${PATH}"
+# Rust release to install; pinned to the version of the CPU builder image
+# (`lukemathwalker/cargo-chef:latest-rust-1.85-bookworm`) so that rebuilds do not
+# silently pick up a newer stable compiler
+ENV RUST_VERSION=1.85.0
+# aligned with `cargo-chef` version in `lukemathwalker/cargo-chef:latest-rust-1.85-bookworm`
+ENV CARGO_CHEF=0.1.71
+
+RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    curl \
+    libssl-dev \
+    pkg-config \
+    libgssapi-krb5-2 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download and configure sccache
+RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
+    chmod +x /usr/local/bin/sccache
+
+# Install rustup non-interactively (-y) with the pinned toolchain as default; the
+# curl flags restrict the download to HTTPS with TLS >= 1.2, as rustup recommends
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y --default-toolchain $RUST_VERSION
+RUN cargo install cargo-chef --version $CARGO_CHEF --locked
+
+# Planner stage: copies the workspace sources and manifests, then lets cargo-chef
+# write recipe.json, which the builder stage copies in to pre-build dependencies.
+FROM base-builder AS planner
+
+WORKDIR /usr/src
+
+COPY candle-extensions candle-extensions
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN cargo chef prepare  --recipe-path recipe.json
+
+# Builder stage: pre-builds dependencies for each feature set / CUDA compute
+# capability, then builds one text-embeddings-router binary per capability (75, 80, 90).
+FROM base-builder AS builder
+
+# Build args declared for this stage; nothing in this Dockerfile reads them directly.
+# NOTE(review): presumably consumed by the crates' build scripts - confirm.
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+# sccache specific variables
+ARG SCCACHE_GHA_ENABLED
+
+# Limit parallelism
+ARG RAYON_NUM_THREADS=4
+ARG CARGO_BUILD_JOBS
+ARG CARGO_BUILD_INCREMENTAL
+
+WORKDIR /usr/src
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+
+# Dependency pre-build ("cook") steps. Each one exposes the GitHub Actions cache
+# credentials through BuildKit secret mounts and prints sccache statistics at the end.
+# First with default features and no CUDA_COMPUTE_CAP set:
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    cargo chef cook --release --recipe-path recipe.json && sccache -s;
+
+# Compute capability 75 uses the dedicated `candle-cuda-turing` feature
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s;
+
+# Compute capabilities 80 and 90 share the `candle-cuda` feature and differ only in
+# CUDA_COMPUTE_CAP
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
+
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
+
+# Sources are copied only after the cook steps, so source edits do not invalidate
+# the dependency layers above
+COPY candle-extensions candle-extensions
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+# Every build writes target/release/text-embeddings-router, so each binary is
+# renamed with its compute-capability suffix right after it is built
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing && sccache -s;
+
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-75
+
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
+
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-80
+
+RUN --mount=type=secret,id=actions_cache_url,env=ACTIONS_CACHE_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
+
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90
+
+# Runtime base image: CUDA 12.2 runtime plus the three per-compute-capability
+# router binaries produced by the builder stage.
+FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS base
+
+# Build-time default for USE_FLASH_ATTENTION (True unless overridden with --build-arg)
+ARG DEFAULT_USE_FLASH_ATTENTION=True
+
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    PORT=80 \
+    USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION \
+    HF_HUB_USER_AGENT_ORIGIN=aws:sagemaker:gpu-cuda:inference:tei
+
+# Runtime packages; the apt lists are removed in the same layer
+RUN apt-get update && apt-get upgrade -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ca-certificates \
+    libssl-dev \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# NOTE(review): no ENTRYPOINT here; the entrypoint script added by the `sagemaker`
+# stage presumably selects the binary that matches the GPU - confirm.
+COPY --from=builder /usr/src/target/release/text-embeddings-router-75 /usr/local/bin/text-embeddings-router-75
+COPY --from=builder /usr/src/target/release/text-embeddings-router-80 /usr/local/bin/text-embeddings-router-80
+COPY --from=builder /usr/src/target/release/text-embeddings-router-90 /usr/local/bin/text-embeddings-router-90
+
+# Amazon SageMaker compatible image
+FROM base AS sagemaker
+
+# Both scripts are copied with mode 775 to relative destinations, which resolve
+# against the image's working directory, as does the ENTRYPOINT path below.
+# The source path of start-cuda-compat.sh is resolved against the build context root.
+COPY --chmod=775 sagemaker-entrypoint-cuda-all.sh entrypoint.sh
+COPY --chmod=775 /huggingface/pytorch/tei/docker/1.7.0/gpu/start-cuda-compat.sh start-cuda-compat.sh
+
+ENTRYPOINT ["./entrypoint.sh"]
diff --git a/huggingface/pytorch/tei/docker/1.7.0/gpu/start-cuda-compat.sh b/huggingface/pytorch/tei/docker/1.7.0/gpu/start-cuda-compat.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+verlt() {
+    [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
+}
+
+# Prepend the CUDA compat libraries to LD_LIBRARY_PATH when the host NVIDIA driver is
+# older than the driver version encoded in the compat libcuda symlink target.
+# LD_LIBRARY_PATH is exported, so the change is only visible to the shell that runs
+# these lines and to the processes it starts afterwards.
+if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
+    # The symlink target's name carries the driver version after the second dot
+    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d'.' -f 3-)
+    echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
+    # Empty when /proc/driver/nvidia/version is missing or has no matching NVRM line
+    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
+    echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
+    if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
+        # An undetected driver is still skipped, but reported as such instead of
+        # being logged as a "newer" driver
+        echo "Skipping CUDA compat setup as NVIDIA driver version could not be determined"
+    elif verlt "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
+        echo "Adding CUDA compat to LD_LIBRARY_PATH"
+        export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
+        echo "$LD_LIBRARY_PATH"
+    else
+        echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
+    fi
+else
+    echo "Skipping CUDA compat setup as package not found"
+fi
diff --git a/huggingface/pytorch/tei/docker/tei.py b/huggingface/pytorch/tei/docker/tei.py
diff --git a/releases.json b/releases.json