@@ -1,23 +1,22 @@
-name: Gaudi1 tests (scheduled)
+name: Gaudi3 tests (scheduled)
 
 on:
   workflow_dispatch:
-  schedule:
-    - cron: "0 2 * * *"
+  schedule: # every day at 6 AM UTC
+    - cron: "0 6 * * *"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
-  run_gaudi1_tests:
-    name: Test on Gaudi1
+  run-gaudi3-tests:
     runs-on:
-      group: aws-dl1-24xlarge
+      group: itac-bm-emr-gaudi3-dell-2gaudi
 
     container:
       image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES=0,1
+      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES
       env:
         OMPI_MCA_btl_vader_single_copy_mechanism: none
         PT_ENABLE_INT64_SUPPORT: 1
@@ -50,28 +49,34 @@ jobs:
         run: |
           pip install -e .[testing] \
           git+https://github.com/HabanaAI/[email protected] \
-          git+https://github.com/huggingface/transformers.git@hpu-support
+          git+https://github.com/huggingface/transformers.git
 
       - name: Run CLI tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_cli
 
       - name: Run Core tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_core
 
       - name: Run Big Modeling tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_big_modeling
 
       - name: Run FSDP integration tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_fsdp
 
       - name: Run DeepSpeed integration tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_deepspeed
 
       - name: Run Examples tests
+        if: ${{ !cancelled() && (success() || failure()) }}
         run: |
           make test_examples
26 changes: 16 additions & 10 deletions tests/test_compile.py
@@ -16,7 +16,7 @@
 import torch
 from torch.utils.benchmark import Timer
 
-from accelerate.test_utils import require_huggingface_suite, require_non_cpu, torch_device
+from accelerate.test_utils import require_huggingface_suite, require_non_cpu, require_non_hpu, torch_device
 from accelerate.utils import compile_regions, extract_model_from_parallel, release_memory
 
 
@@ -28,7 +28,13 @@
 INFRENCE_STMT = "model(input_ids, use_cache=False)"
 COMPILE_STMT = f"torch._dynamo.reset(); torch._inductor.utils.clear_inductor_caches(); {INFRENCE_STMT}"
 
+if torch_device == "hpu":
+    backend = "hpu_backend"
+else:
+    backend = "inductor"
+
 
+@require_non_hpu
 @require_huggingface_suite
 class RegionalCompilationTester(unittest.TestCase):
     def _get_model_and_inputs(self):
@@ -43,7 +49,7 @@ def _get_model_and_inputs(self):
 
     def test_regions_are_compiled(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model, mode="reduce-overhead")
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
 
         # Check that the compiled model keeps a reference to the original model
         assert hasattr(compiled_model, "_orig_mod")
@@ -55,20 +61,20 @@ def test_regions_are_compiled(self):
 
     def test_extract_model_keep_torch_compile(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model)
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
 
         distributed_model = torch.nn.parallel.DataParallel(model)
-        distributed_compiled_model = compile_regions(distributed_model)
+        distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
         compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=True)
 
         assert compiled_model._orig_mod is compiled_model_unwrapped._orig_mod
 
     def test_extract_model_remove_torch_compile(self):
         model, _ = self._get_model_and_inputs()
-        compiled_model = compile_regions(model)
+        compiled_model = compile_regions(model, mode="reduce-overhead", backend=backend)
 
         distributed_model = torch.nn.parallel.DataParallel(model)
-        distributed_compiled_model = compile_regions(distributed_model)
+        distributed_compiled_model = compile_regions(distributed_model, mode="reduce-overhead", backend=backend)
         compiled_model_unwrapped = extract_model_from_parallel(distributed_compiled_model, keep_torch_compile=False)
 
         assert compiled_model._orig_mod is compiled_model_unwrapped
@@ -78,14 +84,14 @@ def test_extract_model_remove_torch_compile(self):
     def test_regional_compilation_cold_start(self):
         model, input_ids = self._get_model_and_inputs()
 
-        regional_compilation_model = compile_regions(model)
+        regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
         regional_compilation_cold_start = (
             Timer(stmt=COMPILE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
             .timeit(COMPILE_ITERS)
             .median
         )
 
-        full_compilation_model = torch.compile(model)
+        full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
         full_compilation_cold_start = (
             Timer(stmt=COMPILE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
             .timeit(COMPILE_ITERS)
@@ -109,14 +115,14 @@ def test_regional_compilation_inference_speedup(self):
             Timer(stmt=INFRENCE_STMT, globals={"model": model, "input_ids": input_ids}).timeit(INFERENCE_ITERS).median
         )
 
-        regional_compilation_model = compile_regions(model)
+        regional_compilation_model = compile_regions(model, mode="reduce-overhead", backend=backend)
         regional_compilation_inference_latency = (
             Timer(stmt=INFRENCE_STMT, globals={"model": regional_compilation_model, "input_ids": input_ids})
             .timeit(INFERENCE_ITERS)
             .median
         )
 
-        full_compilation_model = torch.compile(model)
+        full_compilation_model = torch.compile(model, mode="reduce-overhead", backend=backend)
         full_compilation_inference_latency = (
             Timer(stmt=INFRENCE_STMT, globals={"model": full_compilation_model, "input_ids": input_ids})
             .timeit(INFERENCE_ITERS)
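Taken together, the test_compile.py changes follow one pattern: pick the torch.compile backend from the active device ("hpu_backend" on Gaudi, "inductor" everywhere else) and pass the same mode and backend to both compile_regions and torch.compile, so the regional and full compilation paths stay comparable. A minimal, self-contained sketch of that pattern; the tiny Sequential model below is a made-up stand-in for the Hugging Face model the tests actually build:

import torch

from accelerate.test_utils import torch_device
from accelerate.utils import compile_regions

# Device-dependent torch.compile backend, mirroring the module-level choice in the diff.
backend = "hpu_backend" if torch_device == "hpu" else "inductor"

# Toy model for illustration only; the tests use a small transformers model instead.
model = torch.nn.Sequential(torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 2))

# Regional (per-block) compilation vs. whole-model compilation, pinned to the
# same mode/backend so behavior and timings can be compared fairly.
regional_model = compile_regions(model, mode="reduce-overhead", backend=backend)
full_model = torch.compile(model, mode="reduce-overhead", backend=backend)

# First call triggers compilation on the selected backend.
output = regional_model(torch.randn(4, 8))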
6 changes: 5 additions & 1 deletion tests/test_load_checkpoint_and_dispatch_with_broadcast.py
@@ -33,10 +33,11 @@
     execute_subprocess_async,
     get_torch_dist_unique_port,
     require_multi_device,
+    run_first,
     torch_device,
 )
 from accelerate.test_utils.testing import require_torch_min_version, require_transformers
-from accelerate.utils.imports import is_transformers_available, is_xccl_available
+from accelerate.utils.imports import is_hpu_available, is_transformers_available, is_xccl_available
 
 
 if is_transformers_available():
@@ -53,6 +54,8 @@ def wrapped(*args: Any, **kwargs: Any) -> Any:
         # pytorch built-in xccl will be available from PyTorch 2.9, will remove this after we have xccl
         if torch_device == "xpu" and not is_xccl_available():
             dist.init_process_group(backend="ccl", world_size=torch_accelerator_module.device_count())
+        elif torch_device == "hpu" and is_hpu_available(init_hccl=True):
+            dist.init_process_group(backend="hccl", world_size=torch_accelerator_module.device_count())
         else:
             dist.init_process_group(world_size=torch_accelerator_module.device_count())
         try:
@@ -188,6 +191,7 @@ def load_checkpoint_and_dispatch_ddp():
 @require_torch_min_version(version="2.4.0")
 @require_transformers
 @require_multi_device
+@run_first
 class TestLoadCheckpointAndDispatchWithBroadcast(unittest.TestCase):
     def setUp(self):
         self.torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
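The process-group setup in this test now has three branches: oneCCL ("ccl") for XPU builds without PyTorch's built-in XCCL, HCCL for Gaudi (gated on is_hpu_available(init_hccl=True)), and PyTorch's default backend everywhere else. Condensed into a standalone sketch, assuming a torchrun-style environment (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT already set):

import torch
import torch.distributed as dist

from accelerate.test_utils import torch_device
from accelerate.utils.imports import is_hpu_available, is_xccl_available

# Accelerator module for the active device (falls back to torch.cuda), as in setUp above.
torch_accelerator_module = getattr(torch, torch_device, torch.cuda)
world_size = torch_accelerator_module.device_count()

if torch_device == "xpu" and not is_xccl_available():
    # XPU before built-in XCCL lands (PyTorch 2.9): use the oneCCL bindings.
    dist.init_process_group(backend="ccl", world_size=world_size)
elif torch_device == "hpu" and is_hpu_available(init_hccl=True):
    # Gaudi/HPU: use the HCCL collective backend.
    dist.init_process_group(backend="hccl", world_size=world_size)
else:
    # CUDA/CPU (and XPU with XCCL): let PyTorch choose the default backend.
    dist.init_process_group(world_size=world_size)

try:
    pass  # the wrapped test body runs here
finally:
    dist.destroy_process_group()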