-
Notifications
You must be signed in to change notification settings - Fork 6.1k
Speedup model loading by 4-5x ⚡ #11904
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
7a9c448
8385f45
9e4873b
20b1155
b776aaa
e364dfd
ea446b1
e736b09
4c81c96
591655e
582af9b
a6ee660
bbbc4c0
39f0850
58fcfdc
b0552bb
275e470
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -16,9 +16,10 @@ | |||
|
||||
import importlib | ||||
import inspect | ||||
import math | ||||
import os | ||||
from array import array | ||||
from collections import OrderedDict | ||||
from collections import OrderedDict, defaultdict | ||||
from pathlib import Path | ||||
from typing import Dict, List, Optional, Union | ||||
from zipfile import is_zipfile | ||||
|
@@ -38,6 +39,7 @@ | |||
_get_model_file, | ||||
deprecate, | ||||
is_accelerate_available, | ||||
is_accelerate_version, | ||||
is_gguf_available, | ||||
is_torch_available, | ||||
is_torch_version, | ||||
|
@@ -252,6 +254,10 @@ def load_model_dict_into_meta( | |||
param = param.to(dtype) | ||||
set_module_kwargs["dtype"] = dtype | ||||
|
||||
if is_accelerate_version(">=", "1.9.0.dev0"): | ||||
a-r-r-o-w marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
set_module_kwargs["non_blocking"] = True | ||||
set_module_kwargs["_empty_cache"] = False | ||||
|
||||
# For compatibility with PyTorch load_state_dict which converts state dict dtype to existing dtype in model, and which | ||||
# uses `param.copy_(input_param)` that preserves the contiguity of the parameter in the model. | ||||
# Reference: https://github.com/pytorch/pytorch/blob/db79ceb110f6646523019a59bbd7b838f43d4a86/torch/nn/modules/module.py#L2040C29-L2040C29 | ||||
|
@@ -520,3 +526,65 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): | |||
parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights | ||||
|
||||
return parsed_parameters | ||||
|
||||
|
||||
def _find_mismatched_keys( | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Taken out of here: diffusers/src/diffusers/models/modeling_utils.py Line 1509 in 9f4d997
|
||||
state_dict, | ||||
model_state_dict, | ||||
loaded_keys, | ||||
ignore_mismatched_sizes, | ||||
): | ||||
mismatched_keys = [] | ||||
if not ignore_mismatched_sizes: | ||||
return mismatched_keys | ||||
for checkpoint_key in loaded_keys: | ||||
model_key = checkpoint_key | ||||
# If the checkpoint is sharded, we may not have the key here. | ||||
if checkpoint_key not in state_dict: | ||||
continue | ||||
|
||||
if model_key in model_state_dict and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape: | ||||
mismatched_keys.append( | ||||
(checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) | ||||
) | ||||
del state_dict[checkpoint_key] | ||||
return mismatched_keys | ||||
|
||||
|
||||
def _expand_device_map(device_map, param_names): | ||||
""" | ||||
Expand a device map to return the correspondence parameter name to device. | ||||
""" | ||||
new_device_map = {} | ||||
for module, device in device_map.items(): | ||||
new_device_map.update( | ||||
{p: device for p in param_names if p == module or p.startswith(f"{module}.") or module == ""} | ||||
) | ||||
return new_device_map | ||||
|
||||
|
||||
# Adapted from: https://github.com/huggingface/transformers/blob/0687d481e2c71544501ef9cb3eef795a6e79b1de/src/transformers/modeling_utils.py#L5859 | ||||
def _caching_allocator_warmup(model, expanded_device_map: Dict[str, torch.device], dtype: torch.dtype) -> None: | ||||
a-r-r-o-w marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
""" | ||||
This function warm-ups the caching allocator based on the size of the model tensors that will reside on each | ||||
device. It allows to have one large call to Malloc, instead of recursively calling it later when loading the model, | ||||
which is actually the loading speed bottleneck. Calling this function allows to cut the model loading time by a | ||||
very large margin. | ||||
""" | ||||
# Remove disk and cpu devices, and cast to proper torch.device | ||||
accelerator_device_map = { | ||||
param: torch.device(device) | ||||
for param, device in expanded_device_map.items() | ||||
if str(device) not in ["cpu", "disk"] | ||||
} | ||||
parameter_count = defaultdict(lambda: 0) | ||||
for param_name, device in accelerator_device_map.items(): | ||||
try: | ||||
param = model.get_parameter(param_name) | ||||
except AttributeError: | ||||
param = model.get_buffer(param_name) | ||||
parameter_count[device] += math.prod(param.shape) | ||||
|
||||
# This will kick off the caching allocator to avoid having to Malloc afterwards | ||||
for device, param_count in parameter_count.items(): | ||||
_ = torch.empty(param_count, dtype=dtype, device=device, requires_grad=False) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -184,5 +184,14 @@ def get_device(): | |
def empty_device_cache(device_type: Optional[str] = None):
    """
    Call `empty_cache()` on the torch backend module named by `device_type`.

    When `device_type` is `None` it is resolved through `get_device()`. Nothing is done for `"cpu"`. If `torch` has
    no attribute with the given name, `torch.cuda` is used instead.
    """
    resolved_type = get_device() if device_type is None else device_type
    if resolved_type == "cpu":
        return
    backend = getattr(torch, resolved_type, torch.cuda)
    backend.empty_cache()
|
||
|
||
def device_synchronize(device_type: Optional[str] = None): | ||
if device_type is None: | ||
device_type = get_device() | ||
device_mod = getattr(torch, device_type, torch.cuda) | ||
device_mod.synchronize() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess all different backends ought to have this method. Just flagging. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. AFAIK, synchronize should be available on all devices. Only the empty_cache function required a special check, because it would fail if the device was cpu. |
Uh oh!
There was an error while loading. Please reload this page.