
Commit 9263eb1

fix: limit peak memory to build cuda-all docker image (huggingface#246)
1 parent d33d44a commit 9263eb1

File tree

11 files changed: +132 -96 lines

.github/workflows/build_all.yaml

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,8 @@
 on:
   workflow_dispatch:
   push:
+    branches:
+      - 'main'
     tags:
       - 'v*'
 

Dockerfile-cuda-all

Lines changed: 27 additions & 32 deletions
@@ -33,49 +33,46 @@ FROM base-builder AS builder
 
 ARG GIT_SHA
 ARG DOCKER_LABEL
-ARG VERTEX
+ARG VERTEX="false"
 
 # sccache specific variables
 ARG ACTIONS_CACHE_URL
 ARG ACTIONS_RUNTIME_TOKEN
 ARG SCCACHE_GHA_ENABLED
 
 # limit the number of kernels built at the same time
-ARG RAYON_NUM_THREADS=2
+ARG RAYON_NUM_THREADS=4
 
 WORKDIR /usr/src
 
 COPY --from=planner /usr/src/recipe.json recipe.json
 
-FROM builder as builder-75
-
 RUN if [ $VERTEX = "true" ]; \
     then \
-    CUDA_COMPUTE_CAP=75 cargo chef cook --release --features google --features candle-cuda-turing --features http --no-default-features --recipe-path recipe.json && sccache -s; \
+    cargo chef cook --release --features google --recipe-path recipe.json && sccache -s; \
     else \
-    CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --no-default-features --features http --recipe-path recipe.json && sccache -s; \
+    cargo chef cook --release --recipe-path recipe.json && sccache -s; \
     fi;
 
-COPY backends backends
-COPY core core
-COPY router router
-COPY Cargo.toml ./
-COPY Cargo.lock ./
-
 RUN if [ $VERTEX = "true" ]; \
     then \
-    CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F http -F google --no-default-features && sccache -s; \
+    CUDA_COMPUTE_CAP=75 cargo chef cook --release --features google --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \
    else \
-    CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F http --no-default-features && sccache -s; \
+    CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \
     fi;
 
-FROM builder as builder-80
+RUN if [ $VERTEX = "true" ]; \
+    then \
+    CUDA_COMPUTE_CAP=80 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
+    else \
+    CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
+    fi;
 
 RUN if [ $VERTEX = "true" ]; \
     then \
-    CUDA_COMPUTE_CAP=80 cargo chef cook --release --features google --features candle-cuda --features http --no-default-features --recipe-path recipe.json && sccache -s; \
+    CUDA_COMPUTE_CAP=90 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
     else \
-    CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --no-default-features --features http --recipe-path recipe.json && sccache -s; \
+    CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
     fi;
 
 COPY backends backends
@@ -86,33 +83,31 @@ COPY Cargo.lock ./
 
 RUN if [ $VERTEX = "true" ]; \
     then \
-    CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F http -F google --no-default-features && sccache -s; \
+    CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F google && sccache -s; \
     else \
-    CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F http --no-default-features && sccache -s; \
+    CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing && sccache -s; \
     fi;
 
-FROM builder as builder-90
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-75
 
 RUN if [ $VERTEX = "true" ]; \
     then \
-    CUDA_COMPUTE_CAP=90 cargo chef cook --release --features google --features candle-cuda --features http --no-default-features --recipe-path recipe.json && sccache -s; \
+    CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F google && sccache -s; \
     else \
-    CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --features http --no-default-features --recipe-path recipe.json && sccache -s; \
+    CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
     fi;
 
-COPY backends backends
-COPY core core
-COPY router router
-COPY Cargo.toml ./
-COPY Cargo.lock ./
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-80
 
 RUN if [ $VERTEX = "true" ]; \
     then \
-    CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F http -F google --no-default-features && sccache -s; \
+    CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F google && sccache -s; \
     else \
-    CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F http --no-default-features && sccache -s; \
+    CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
     fi;
 
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90
+
 FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 as base
 
 ARG DEFAULT_USE_FLASH_ATTENTION=True
@@ -121,9 +116,9 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
     PORT=80 \
     USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION
 
-COPY --from=builder-75 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-75
-COPY --from=builder-80 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-80
-COPY --from=builder-90 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-90
+COPY --from=builder /usr/src/target/release/text-embeddings-router-75 /usr/local/bin/text-embeddings-router-75
+COPY --from=builder /usr/src/target/release/text-embeddings-router-80 /usr/local/bin/text-embeddings-router-80
+COPY --from=builder /usr/src/target/release/text-embeddings-router-90 /usr/local/bin/text-embeddings-router-90
 
 # Amazon SageMaker compatible image
 FROM base AS sagemaker

backends/candle/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ authors.workspace = true
 homepage.workspace = true
 
 [dependencies]
+anyhow = "^1.0"
 accelerate-src = { version = "0.3.2", optional = true }
 intel-mkl-src = { version = "0.8.1", optional = true }
 candle = { version = "*", package = "candle-core", default-features = false }

backends/candle/src/compute_cap.rs

Lines changed: 23 additions & 35 deletions
@@ -1,41 +1,26 @@
+use anyhow::Context;
+use candle::cuda_backend::cudarc::driver;
 use candle::cuda_backend::cudarc::driver::sys::CUdevice_attribute::{
     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
 };
 use candle::cuda_backend::cudarc::driver::CudaDevice;
-use std::sync::Once;
 
-static INIT: Once = Once::new();
-static mut RUNTIME_COMPUTE_CAP: usize = 0;
-static mut COMPILE_COMPUTE_CAP: usize = 0;
-
-fn init_compute_caps() {
-    unsafe {
-        INIT.call_once(|| {
-            let device = CudaDevice::new(0).expect("cuda is not available");
-            let major = device
-                .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
-                .unwrap();
-            let minor = device
-                .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
-                .unwrap();
-            RUNTIME_COMPUTE_CAP = (major * 10 + minor) as usize;
-            COMPILE_COMPUTE_CAP = env!("CUDA_COMPUTE_CAP").parse::<usize>().unwrap();
-        });
-    }
-}
-
-pub fn get_compile_compute_cap() -> usize {
-    unsafe {
-        init_compute_caps();
-        COMPILE_COMPUTE_CAP
-    }
+pub fn get_compile_compute_cap() -> Result<usize, anyhow::Error> {
+    env!("CUDA_COMPUTE_CAP")
+        .parse::<usize>()
+        .context("Could not retrieve compile time CUDA_COMPUTE_CAP")
 }
 
-pub fn get_runtime_compute_cap() -> usize {
-    unsafe {
-        init_compute_caps();
-        RUNTIME_COMPUTE_CAP
-    }
+pub fn get_runtime_compute_cap() -> Result<usize, anyhow::Error> {
+    driver::result::init().context("CUDA is not available")?;
+    let device = CudaDevice::new(0).context("CUDA is not available")?;
+    let major = device
+        .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
+        .context("Could not retrieve device compute capability major")?;
+    let minor = device
+        .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
+        .context("Could not retrieve device compute capability minor")?;
+    Ok((major * 10 + minor) as usize)
 }
 
 fn compute_cap_matching(runtime_compute_cap: usize, compile_compute_cap: usize) -> bool {
@@ -49,10 +34,13 @@ fn compute_cap_matching(runtime_compute_cap: usize, compile_compute_cap: usize)
     }
 }
 
-pub fn incompatible_compute_cap() -> bool {
-    let compile_compute_cap = get_compile_compute_cap();
-    let runtime_compute_cap = get_runtime_compute_cap();
-    !compute_cap_matching(runtime_compute_cap, compile_compute_cap)
+pub fn compatible_compute_cap() -> Result<bool, anyhow::Error> {
+    let compile_compute_cap = get_compile_compute_cap()?;
+    let runtime_compute_cap = get_runtime_compute_cap()?;
+    Ok(compute_cap_matching(
+        runtime_compute_cap,
+        compile_compute_cap,
+    ))
 }
 
 #[cfg(test)]
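
The compute-cap helpers now return `Result` instead of panicking, so callers can tell a capability mismatch apart from the absence of a usable CUDA driver. Below is a minimal, self-contained sketch of how the new fallible API composes. The stubbed probe values and the simplified matching rule are assumptions for illustration; only the shape of `compatible_compute_cap` mirrors the diff above.

```rust
use anyhow::{bail, Context, Result};

// Stand-ins for the functions in backends/candle/src/compute_cap.rs, stubbed so
// the sketch compiles and runs without a CUDA toolchain. The real versions read
// the CUDA_COMPUTE_CAP env var set at build time and query the driver at runtime.
fn get_compile_compute_cap() -> Result<usize> {
    "80".parse::<usize>()
        .context("Could not retrieve compile time CUDA_COMPUTE_CAP")
}

fn get_runtime_compute_cap() -> Result<usize> {
    Ok(75) // pretend the host exposes a Turing (sm_75) GPU
}

// Simplified matching rule; the real one also accepts e.g. newer caps running
// kernels compiled for an older compatible cap.
fn compute_cap_matching(runtime: usize, compile: usize) -> bool {
    runtime == compile
}

fn compatible_compute_cap() -> Result<bool> {
    Ok(compute_cap_matching(
        get_runtime_compute_cap()?,
        get_compile_compute_cap()?,
    ))
}

fn main() -> Result<()> {
    // Callers can now distinguish "caps mismatch" (hard error) from
    // "no CUDA at all" (fall back to CPU) instead of panicking.
    match compatible_compute_cap() {
        Ok(true) => println!("CUDA kernels are usable on this host"),
        Ok(false) => bail!(
            "Runtime compute cap {} is not compatible with compile time compute cap {}",
            get_runtime_compute_cap()?,
            get_compile_compute_cap()?
        ),
        Err(err) => println!("Could not probe CUDA ({err}), using CPU instead"),
    }
    Ok(())
}
```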

backends/candle/src/flash_attn.rs

Lines changed: 19 additions & 1 deletion
@@ -1,5 +1,23 @@
-use crate::compute_cap::get_runtime_compute_cap;
 use candle::Tensor;
+use std::sync::Once;
+
+static INIT: Once = Once::new();
+static mut RUNTIME_COMPUTE_CAP: usize = 0;
+fn init_runtime_compute_cap() {
+    unsafe {
+        INIT.call_once(|| {
+            use crate::compute_cap::get_runtime_compute_cap;
+            RUNTIME_COMPUTE_CAP = get_runtime_compute_cap().unwrap();
+        });
+    }
+}
+
+pub fn get_runtime_compute_cap() -> usize {
+    unsafe {
+        init_runtime_compute_cap();
+        RUNTIME_COMPUTE_CAP
+    }
+}
 
 #[allow(clippy::too_many_arguments, unused)]
 pub(crate) fn flash_attn_varlen(
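
Because `get_runtime_compute_cap` in `compute_cap.rs` is now fallible, the flash-attention module probes the value once and keeps exposing an infallible `usize` to the kernels. A small self-contained sketch of the same `Once` pattern, with the driver probe stubbed out and a purely illustrative dispatch at the end (the actual branching inside `flash_attn_varlen` may differ):

```rust
use std::sync::Once;

static INIT: Once = Once::new();
static mut RUNTIME_COMPUTE_CAP: usize = 0;

// Stub for the fallible driver probe; the real module calls
// crate::compute_cap::get_runtime_compute_cap() here and unwraps it, since a
// CUDA device must exist by the time a flash-attention kernel is launched.
fn probe_runtime_compute_cap() -> Result<usize, &'static str> {
    Ok(80)
}

pub fn get_runtime_compute_cap() -> usize {
    unsafe {
        // The probe runs at most once; later calls read the cached value.
        INIT.call_once(|| {
            RUNTIME_COMPUTE_CAP = probe_runtime_compute_cap().unwrap();
        });
        RUNTIME_COMPUTE_CAP
    }
}

fn main() {
    // Illustrative kernel selection based on the cached capability.
    match get_runtime_compute_cap() {
        75 => println!("dispatch to the Turing (sm_75) flash-attention kernels"),
        cap if cap >= 80 => println!("dispatch to the Ampere+ (sm_{cap}) kernels"),
        cap => println!("flash attention is unsupported on sm_{cap}"),
    }
}
```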

backends/candle/src/layers/cublaslt.rs

Lines changed: 20 additions & 14 deletions
@@ -11,21 +11,27 @@ static mut CUBLASLT: Option<CublasLtWrapper> = None;
 pub fn get_cublas_lt_wrapper() -> Option<&'static CublasLtWrapper> {
     unsafe {
         INIT.call_once(|| {
-            CUBLASLT = match Device::cuda_if_available(0) {
-                Ok(device) => {
-                    #[cfg(feature = "cuda")]
-                    {
-                        Some(CublasLtWrapper {
+            #[cfg(not(feature = "cuda"))]
+            {
+                CUBLASLT = None;
+            }
+
+            #[cfg(feature = "cuda")]
+            {
+                // Check if we can call the driver
+                // Then check if we can create a device
+                // Then check that the device is CUDA
+                use candle::cuda_backend::cudarc::driver;
+                CUBLASLT = driver::result::init()
+                    .ok()
+                    .and_then(|_| Device::cuda_if_available(0).ok())
+                    .and_then(|device| match device {
+                        Device::Cuda(_) => Some(CublasLtWrapper {
                             cublaslt: CublasLt::new(&device).unwrap(),
-                        })
-                    }
-                    #[cfg(not(feature = "cuda"))]
-                    {
-                        None
-                    }
-                }
-                Err(_) => None,
-            };
+                        }),
+                        _ => None,
+                    });
+            }
         });
         CUBLASLT.as_ref()
     }
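
The wrapper is now only constructed when the driver can be loaded, a device can be created, and that device is actually CUDA; any failure in the chain quietly yields `None`. A self-contained sketch of that probe chain with the three steps stubbed out (the real steps are `driver::result::init()`, `Device::cuda_if_available(0)` and the `Device::Cuda(_)` match):

```rust
// Each step of the probe is fallible; `.ok()`/`.and_then()` collapse the whole
// chain to `None` if any step fails, so callers fall back to the default
// matmul path instead of panicking on CPU-only hosts.
#[allow(dead_code)]
#[derive(Debug)]
enum Device {
    Cpu,
    Cuda(usize),
}

// Stubbed driver initialisation: pretend libcuda cannot be loaded.
fn driver_init() -> Result<(), &'static str> {
    Err("libcuda not found")
}

// Stubbed device creation.
fn cuda_if_available(ordinal: usize) -> Result<Device, &'static str> {
    Ok(Device::Cuda(ordinal))
}

fn main() {
    let cublaslt_handle = driver_init()
        .ok()
        .and_then(|_| cuda_if_available(0).ok())
        .and_then(|device| match device {
            Device::Cuda(ordinal) => Some(format!("cuBLASLt handle on cuda:{ordinal}")),
            _ => None,
        });
    // Prints `None` with the stubbed driver failure above.
    println!("{cublaslt_handle:?}");
}
```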

backends/candle/src/lib.rs

Lines changed: 30 additions & 9 deletions
@@ -8,7 +8,7 @@ mod models;
 
 #[cfg(feature = "cuda")]
 use crate::compute_cap::{
-    get_compile_compute_cap, get_runtime_compute_cap, incompatible_compute_cap,
+    compatible_compute_cap, get_compile_compute_cap, get_runtime_compute_cap,
 };
 use crate::models::{
     BertModel, DistilBertConfig, DistilBertModel, JinaBertModel, Model, NomicBertModel,
@@ -43,6 +43,7 @@ enum Config {
 }
 
 pub struct CandleBackend {
+    device: Device,
     model: Box<dyn Model + Send>,
 }
 
@@ -61,14 +62,23 @@ impl CandleBackend {
         // Get candle device
         let device = if candle::utils::cuda_is_available() {
             #[cfg(feature = "cuda")]
-            if incompatible_compute_cap() {
-                return Err(BackendError::Start(format!(
-                    "Runtime compute cap {} is not compatible with compile time compute cap {}",
-                    get_runtime_compute_cap(),
-                    get_compile_compute_cap()
-                )));
+            match compatible_compute_cap() {
+                Ok(true) => Device::new_cuda(0),
+                Ok(false) => {
+                    return Err(BackendError::Start(format!(
+                        "Runtime compute cap {} is not compatible with compile time compute cap {}",
+                        get_runtime_compute_cap().unwrap(),
+                        get_compile_compute_cap().unwrap()
+                    )))
+                }
+                Err(err) => {
+                    tracing::warn!("Could not find a compatible CUDA device on host: {err}");
+                    tracing::warn!("Using CPU instead");
+                    Ok(Device::Cpu)
+                }
             }
-            Device::new_cuda(0)
+            #[cfg(not(feature = "cuda"))]
+            Ok(Device::Cpu)
         } else if candle::utils::metal_is_available() {
             Device::new_metal(0)
         } else {
@@ -225,11 +235,22 @@ impl CandleBackend {
             }
         };
 
-        Ok(Self { model: model? })
+        Ok(Self {
+            device,
+            model: model?,
+        })
     }
 }
 
 impl Backend for CandleBackend {
+    fn max_batch_size(&self) -> Option<usize> {
+        // Limit max batch size to 4 on CPU
+        if matches!(self.device, Device::Cpu) {
+            return Some(4);
+        }
+        None
+    }
+
     fn health(&self) -> Result<(), BackendError> {
         Ok(())
     }
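
`CandleBackend` now remembers which device it ended up on and advertises a batch-size limit when that device is the CPU. A hypothetical sketch of how a scheduler could consume that hint; the `Backend` trait and the clamping helper below are stand-ins, not the router's actual types:

```rust
// Stand-in trait mirroring only the new hint; the real Backend trait in
// text-embeddings-inference has more methods.
trait Backend {
    fn max_batch_size(&self) -> Option<usize> {
        None
    }
}

struct CpuBackend;

impl Backend for CpuBackend {
    fn max_batch_size(&self) -> Option<usize> {
        // Mirrors the diff: CPU inference is capped at 4 requests per batch.
        Some(4)
    }
}

// Clamp a requested batch size to whatever the backend can handle.
fn effective_batch_size(backend: &dyn Backend, requested: usize) -> usize {
    match backend.max_batch_size() {
        Some(limit) => requested.min(limit),
        None => requested,
    }
}

fn main() {
    let backend = CpuBackend;
    // A burst of 32 queued requests would be served in batches of at most 4 on CPU.
    assert_eq!(effective_batch_size(&backend, 32), 4);
    println!("effective batch size: {}", effective_batch_size(&backend, 32));
}
```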

backends/candle/src/models/bert.rs

Lines changed: 5 additions & 3 deletions
@@ -405,13 +405,14 @@ impl ClassificationHead for BertClassificationHead {
     fn forward(&self, hidden_states: &Tensor) -> Result<Tensor> {
         let _enter = self.span.enter();
 
-        let mut hidden_states = hidden_states.clone();
+        let mut hidden_states = hidden_states.unsqueeze(1)?;
         if let Some(pooler) = self.pooler.as_ref() {
             hidden_states = pooler.forward(&hidden_states)?;
             hidden_states = hidden_states.tanh()?;
         }
 
         let hidden_states = self.output.forward(&hidden_states)?;
+        let hidden_states = hidden_states.squeeze(1)?;
         Ok(hidden_states)
     }
 }
@@ -453,10 +454,11 @@ impl ClassificationHead for RobertaClassificationHead {
     fn forward(&self, hidden_states: &Tensor) -> Result<Tensor> {
         let _enter = self.span.enter();
 
-        let hidden_states = self.intermediate.forward(hidden_states)?;
+        let hidden_states = hidden_states.unsqueeze(1)?;
+        let hidden_states = self.intermediate.forward(&hidden_states)?;
         let hidden_states = hidden_states.tanh()?;
         let hidden_states = self.output.forward(&hidden_states)?;
-
+        let hidden_states = hidden_states.squeeze(1)?;
         Ok(hidden_states)
     }
 }
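
The classification heads now lift the pooled `[batch, hidden]` activations to `[batch, 1, hidden]` before the linear layers and squeeze the extra dimension back out afterwards, presumably so the (possibly cuBLASLt-backed) linear path always sees rank-3 inputs. A shape-only illustration using candle-core (assumed as a dependency; the sizes are arbitrary):

```rust
use candle_core::{DType, Device, Result, Tensor};

fn main() -> Result<()> {
    // A [batch, hidden] activation, as produced by pooling.
    let hidden_states = Tensor::zeros((8, 768), DType::F32, &Device::Cpu)?;
    assert_eq!(hidden_states.dims(), &[8, 768]);

    // Lift to rank 3 so the linear layers see an explicit sequence dimension of length 1.
    let lifted = hidden_states.unsqueeze(1)?;
    assert_eq!(lifted.dims(), &[8, 1, 768]);

    // After the head has run, drop the dummy dimension again.
    let restored = lifted.squeeze(1)?;
    assert_eq!(restored.dims(), &[8, 768]);
    Ok(())
}
```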

router/src/http/server.rs

Lines changed: 3 additions & 1 deletion
@@ -1548,7 +1548,9 @@ pub async fn run(
     }
 
     // Run server
-    let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
+    let listener = tokio::net::TcpListener::bind(&addr)
+        .await
+        .context(format!("Could not bind TCP Listener on {addr}"))?;
 
     tracing::info!("Starting HTTP server: {}", &addr);
     tracing::info!("Ready");
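
Binding the listener now surfaces a descriptive error instead of panicking. A minimal sketch of the same `anyhow::Context` pattern, assuming `tokio` (with the `macros`, `rt-multi-thread` and `net` features) and `anyhow` as dependencies; the address is arbitrary:

```rust
use anyhow::Context;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let addr = "127.0.0.1:8080";
    // On failure (e.g. the port is already taken) the error now says which
    // address could not be bound instead of a bare panic from `unwrap()`.
    let _listener = tokio::net::TcpListener::bind(addr)
        .await
        .context(format!("Could not bind TCP Listener on {addr}"))?;
    println!("listening on {addr}");
    Ok(())
}
```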
