diff --git a/benchmarks/big_model_inference/README.md b/benchmarks/big_model_inference/README.md
index 243e9df58ea..8736bec2467 100644
--- a/benchmarks/big_model_inference/README.md
+++ b/benchmarks/big_model_inference/README.md
@@ -13,7 +13,7 @@ pip install transformers
 To reproduce or test a new setup, run
 
 ```py
-python inference_acc.py model_name
+python big_model_inference.py model_name
 ```
 
 This script supports `gpt-j-6b`, `gpt-neox`, `opt` (30B version) and `T0pp` out of the box, but you can specify any valid checkpoint for `model_name`.
@@ -43,4 +43,4 @@ Note on the results:
 You will also note that Accelerate does not use anymore GPU and CPU RAM than necessary:
 - peak GPU memory is exactly the size of the model put on a given GPU
-- peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.
\ No newline at end of file
+- peak CPU memory is either the size of the biggest checkpoint shard or the part of the model offloaded on CPU, whichever is bigger.
diff --git a/benchmarks/big_model_inference/measures_util.py b/benchmarks/big_model_inference/measures_util.py
index f22c0792f45..00fb0dbfb85 100644
--- a/benchmarks/big_model_inference/measures_util.py
+++ b/benchmarks/big_model_inference/measures_util.py
@@ -18,6 +18,12 @@ import psutil
 import torch
 
+from accelerate.test_utils.testing import get_backend
+
+
+torch_device_type, _, _ = get_backend()
+torch_accelerator_module = getattr(torch, torch_device_type, torch.cuda)
+
 
 class PeakCPUMemory:
     def __init__(self):
@@ -54,16 +60,16 @@ def start_measure():
     measures = {"time": time.time()}
 
     gc.collect()
-    torch.cuda.empty_cache()
+    torch_accelerator_module.empty_cache()
 
     # CPU mem
     measures["cpu"] = psutil.Process().memory_info().rss
     cpu_peak_tracker.start()
 
     # GPU mem
-    for i in range(torch.cuda.device_count()):
-        measures[str(i)] = torch.cuda.memory_allocated(i)
-    torch.cuda.reset_peak_memory_stats()
+    for i in range(torch_accelerator_module.device_count()):
+        measures[str(i)] = torch_accelerator_module.memory_allocated(i)
+    torch_accelerator_module.reset_peak_memory_stats()
 
     return measures
 
@@ -73,16 +79,16 @@ def end_measure(start_measures):
     measures = {"time": time.time() - start_measures["time"]}
 
     gc.collect()
-    torch.cuda.empty_cache()
+    torch_accelerator_module.empty_cache()
 
     # CPU mem
     measures["cpu"] = (psutil.Process().memory_info().rss - start_measures["cpu"]) / 2**20
     measures["cpu-peak"] = (cpu_peak_tracker.stop() - start_measures["cpu"]) / 2**20
 
     # GPU mem
-    for i in range(torch.cuda.device_count()):
-        measures[str(i)] = (torch.cuda.memory_allocated(i) - start_measures[str(i)]) / 2**20
-        measures[f"{i}-peak"] = (torch.cuda.max_memory_allocated(i) - start_measures[str(i)]) / 2**20
+    for i in range(torch_accelerator_module.device_count()):
+        measures[str(i)] = (torch_accelerator_module.memory_allocated(i) - start_measures[str(i)]) / 2**20
+        measures[f"{i}-peak"] = (torch_accelerator_module.max_memory_allocated(i) - start_measures[str(i)]) / 2**20
 
     return measures
 
@@ -90,9 +96,9 @@ def end_measure(start_measures):
 def log_measures(measures, description):
     print(f"{description}:")
     print(f"- Time: {measures['time']:.2f}s")
-    for i in range(torch.cuda.device_count()):
-        print(f"- GPU {i} allocated: {measures[str(i)]:.2f}MiB")
+    for i in range(torch_accelerator_module.device_count()):
+        print(f"- {torch_device_type} {i} allocated: {measures[str(i)]:.2f}MiB")
         peak = measures[f"{i}-peak"]
-        print(f"- GPU {i} peak: {peak:.2f}MiB")
+        print(f"- {torch_device_type} {i} peak: {peak:.2f}MiB")
     print(f"- CPU RAM allocated: {measures['cpu']:.2f}MiB")
     print(f"- CPU RAM peak: {measures['cpu-peak']:.2f}MiB")