@@ -1,23 +1,22 @@
-name: Gaudi1 tests (scheduled)
+name: Gaudi3 tests (scheduled)
 
 on:
   workflow_dispatch:
-  schedule:
-    - cron: "0 2 * * *"
+  schedule: # every day at 6 AM UTC
+    - cron: "0 6 * * *"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
-  run_gaudi1_tests:
-    name: Test on Gaudi1
+  run-gaudi3-tests:
     runs-on:
-      group: aws-dl1-24xlarge
+      group: itac-bm-emr-gaudi3-dell-2gaudi
 
     container:
       image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES=0,1
+      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
       env:
         OMPI_MCA_btl_vader_single_copy_mechanism: none
         PT_ENABLE_INT64_SUPPORT: 1
@@ -50,28 +49,34 @@ jobs:
         run: |
           pip install -e .[testing] \
           git+https://github.com/HabanaAI/[email protected] \
-          git+https://github.com/huggingface/transformers.git@hpu-support
+          git+https://github.com/huggingface/transformers.git
 
       - name: Run CLI tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_cli
 
       - name: Run Core tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_core
 
       - name: Run Big Modeling tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_big_modeling
 
       - name: Run FSDP integration tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_fsdp
 
       - name: Run DeepSpeed integration tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_deepspeed
 
       - name: Run Examples tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_examples
26 changes: 16 additions & 10 deletions tests/test_compile.py
@@ -16,7 +16,7 @@
 import torch
 from torch.utils.benchmark import Timer
 
-from accelerate.test_utils import require_huggingface_suite, require_non_cpu, torch_device
+from accelerate.test_utils import require_huggingface_suite, require_non_cpu, require_non_hpu, torch_device
 from accelerate.utils import compile_regions, extract_model_from_parallel, release_memory
 
 
@@ -28,7 +28,13 @@
 INFRENCE_STMT = "model(input_ids, use_cache=False)"
 COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFRENCE_STMT}"
 
+if torch_device == "hpu":
+    backend = "hpu_backend"
+else:
+    backend = "inductor"
+
 
+@require_non_hpu
 @require_huggingface_suite
 class RegionalCompilationTester(unittest.TestCase):
     def _get_model_and_inputs(self):
@@ -43,7 +49,7 @@ def _get_model_and_inputs(self):
 
     def test_regions_are_compiled(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model, mode="reduce-overhead")
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
 
         # Check that the compiled model keeps a reference to the original model
         assert hasattr(compiled_model, "_orig_mod")
@@ -55,20 +61,20 @@ def test_regions_are_compiled(self):
 
     def test_extract_model_keep_torch_compile(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model)
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
 
         distributed_model = torch.nn.parallel.DataParallel(model)
-        distributed_compiled_model = compile_regions(distributed_model)
+        distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
         compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=True)
 
         assert compiled_model._orig_mod is compiled_model_unwrapped._orig_mod
 
     def test_extract_model_remove_torch_compile(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model)
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
 
         distributed_model = torch.nn.parallel.DataParallel(model)
-        distributed_compiled_model = compile_regions(distributed_model)
+        distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
         compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=False)
 
         assert compiled_model._orig_mod is compiled_model_unwrapped
@@ -78,14 +84,14 @@ def test_extract_model_remove_torch_compile(self):
     def test_regional_compilation_cold_start(self):
         model, input_ids = self._get_model_and_inputs()
 
-        regional_compilation_model = compile_regions(model)
+        regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
         regional_compilation_cold_start = (
             Timer(stmt=COMPILE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
             .timeit(COMPILE_ITERS)
             .median
         )
 
-        full_compilation_model = torch.compile(model)
+        full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
         full_compilation_cold_start = (
             Timer(stmt=COMPILE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
             .timeit(COMPILE_ITERS)
@@ -109,14 +115,14 @@ def test_regional_compilation_inference_speedup(self):
             Timer(stmt=INFRENCE_STMT, globals={"model": model, "input_ids": input_ids}).timeit(INFERENCE_ITERS).median
         )
 
-        regional_compilation_model = compile_regions(model)
+        regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
         regional_compilation_inference_latency = (
             Timer(stmt=INFRENCE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
             .timeit(INFERENCE_ITERS)
             .median
         )
 
-        full_compilation_model = torch.compile(model)
+        full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
         full_compilation_inference_latency = (
             Timer(stmt=INFRENCE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
             .timeit(INFERENCE_ITERS)
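Taken together, the test_compile.py changes follow one pattern: pick the torch.compile backend from the active device ("hpu_backend" on Gaudi, "inductor" everywhere else) and pass the same mode and backend to both compile_regions and torch.compile, so the regional and full compilation paths stay comparable. A minimal, self-contained sketch of that pattern; the tiny Sequential model below is a made-up stand-in for the Hugging Face model the tests actually build:

import torch

from accelerate.test_utils import torch_device
from accelerate.utils import compile_regions

# Device-dependent torch.compile backend, mirroring the module-level choice in the diff.
backend = "hpu_backend" if torch_device == "hpu" else "inductor"

# Toy model for illustration only; the tests use a small transformers model instead.
model = torch.nn.Sequential(torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 2))

# Regional (per-block) compilation vs. whole-model compilation, pinned to the
# same mode/backend so behavior and timings can be compared fairly.
regional_model = compile_regions(model, mode="reduce-overhead", backend=backend)
full_model = torch.compile(model, mode="reduce-overhead", backend=backend)

# First call triggers compilation on the selected backend.
output = regional_model(torch.randn(4, 8))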
6 changes: 5 additions & 1 deletion tests/test_load_checkpoint_and_dispatch_with_broadcast.py
@@ -33,10 +33,11 @@
     execute_subprocess_async,
     get_torch_dist_unique_port,
     require_multi_device,
+    run_first,
     torch_device,
 )
 from accelerate.test_utils.testing import require_torch_min_version, require_transformers
-from accelerate.utils.imports import is_transformers_available, is_xccl_available
+from accelerate.utils.imports import is_hpu_available, is_transformers_available, is_xccl_available
 
 
 if is_transformers_available():
@@ -53,6 +54,8 @@ def wrapped(*args: Any, **kwargs: Any) -> Any:
         # pytorch built-in xccl will be available from PyTorch 2.9, will remove this after we have xccl
         if torch_device == "xpu" and not is_xccl_available():
             dist.init_process_group(backend="ccl", world_size=torch_accelerator_module.device_count())
+        elif torch_device == "hpu" and is_hpu_available(init_hccl=True):
+            dist.init_process_group(backend="hccl", world_size=torch_accelerator_module.device_count())
         else:
             dist.init_process_group(world_size=torch_accelerator_module.device_count())
         try:
@@ -188,6 +191,7 @@ def load_checkpoint_and_dispatch_ddp():
 @require_torch_min_version(version="2.4.0")
 @require_transformers
 @require_multi_device
+@run_first
 class TestLoadCheckpointAndDispatchWithBroadcast(unittest.TestCase):
     def setUp(self):
         self.torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
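The process-group setup in this test now has three branches: oneCCL ("ccl") for XPU builds without PyTorch's built-in XCCL, HCCL for Gaudi (gated on is_hpu_available(init_hccl=True)), and PyTorch's default backend everywhere else. Condensed into a standalone sketch, assuming a torchrun-style environment (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT already set):

import torch
import torch.distributed as dist

from accelerate.test_utils import torch_device
from accelerate.utils.imports import is_hpu_available, is_xccl_available

# Accelerator module for the active device (falls back to torch.cuda), as in setUp above.
torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
world_size = torch_accelerator_module.device_count()

if torch_device == "xpu" and not is_xccl_available():
    # XPU before built-in XCCL lands (PyTorch 2.9): use the oneCCL bindings.
    dist.init_process_group(backend="ccl", world_size=world_size)
elif torch_device == "hpu" and is_hpu_available(init_hccl=True):
    # Gaudi/HPU: use the HCCL collective backend.
    dist.init_process_group(backend="hccl", world_size=world_size)
else:
    # CUDA/CPU (and XPU with XCCL): let PyTorch choose the default backend.
    dist.init_process_group(world_size=world_size)

try:
    pass  # the wrapped test body runs here
finally:
    dist.destroy_process_group()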