@@ -1404,17 +1404,17 @@ def prepare(self, *args, device_placement=None):
         fsdp2_should_fix_optimizer = self.is_fsdp2
         should_fix_optimizer = tpu_should_fix_optimizer or fsdp2_should_fix_optimizer
 
         # We need to specifically prepare AO (possibly other FP8 backends, haven't tested yet) here, as fsdp2 is very picky about the order of preparation
+        if self.is_fsdp2 and self.fp8_backend == "AO":
+            args = self._prepare_ao(*args)
 
+        # Compile needs to be done before gathering old params: investigate why?
         if self.is_fsdp2 and model_index is not None:
             new_args = list(args)
 
             new_args[model_index] = compile_regions(new_args[model_index])
             args = tuple(new_args)
 
-        if self.is_fsdp2 and self.fp8_backend == "AO":
-            args = self._prepare_ao(*args)
-
         if should_fix_optimizer:
             # 1. grabbing old model parameters
             old_named_params = self._get_named_parameters(
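The moved `_prepare_ao` call above reflects the ordering constraint the comment mentions: torchao's float8 module swap has to happen on plain, unsharded `nn.Linear` modules, before FSDP2 wraps them. Below is a rough sketch of that order outside of Accelerate, assuming torchao's public float8 helpers (`convert_to_float8_training`, `Float8LinearConfig`) and torch's FSDP2 `fully_shard`; the exact flags are illustrative, not what Accelerate does internally.

import torch
from torchao.float8 import Float8LinearConfig, convert_to_float8_training

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))

# 1. Swap eligible nn.Linear layers for float8 training layers while they are
#    still regular, unsharded modules (assumed torchao API, shown for ordering).
convert_to_float8_training(model, config=Float8LinearConfig(enable_fsdp_float8_all_gather=True))

# 2. Only afterwards shard with FSDP2; `fully_shard` needs an initialized process
#    group / device mesh, so it is left commented out here.
# from torch.distributed.fsdp import fully_shard  # torch >= 2.6 FSDP2 API (assumption)
# fully_shard(model)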
@@ -1425,6 +1425,7 @@ def prepare(self, *args, device_placement=None):
             # however that goes against `Accelerate's` design of `bring your own`
             # this is a workaround to make memory footprint match if `Optimizer` is created before preparing the model
             if fsdp2_should_fix_optimizer:
+                old_named_params = fsdp2_canonicalize_names(old_named_params)
                 for obj in args:
                     if isinstance(obj, torch.optim.Optimizer):
                         for param_group in obj.param_groups:
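For context on this hunk: `old_named_params` is captured (and, for FSDP2, canonicalized via the added `fsdp2_canonicalize_names` call) so that an optimizer built before `prepare()` can have its stale parameter references swapped for the prepared, sharded ones. The sketch below illustrates that general pattern; `remap_optimizer_params` and the name-keyed lookup are hypothetical, while Accelerate itself matches parameters by storage pointer.

import torch

def remap_optimizer_params(optimizer, old_named_params, new_named_params):
    # Map each pre-preparation parameter object to the post-preparation parameter
    # that carries the same (canonicalized) name.
    old_to_new = {
        id(old_param): new_named_params[name]
        for name, old_param in old_named_params.items()
        if name in new_named_params
    }
    # Swap the stale references inside the optimizer's param groups in place, so
    # optimizer state ends up keyed on the prepared parameters.
    for param_group in optimizer.param_groups:
        param_group["params"] = [old_to_new.get(id(p), p) for p in param_group["params"]]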
@@ -1758,11 +1759,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         if self.delayed_fp8_autocast:
             model = apply_fp8_autowrap(model, self.te_recipe_handler or self.fp8_recipe_handler)
         # torch.compile should be called last and only if the model isn't already compiled
-        if (
-            self.state.dynamo_plugin.backend != DynamoBackend.NO
-            and not is_compiled_module(model)
-            and not self.is_fsdp2
-        ):
+        if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
             if self.state.dynamo_plugin.use_regional_compilation:
                 model = compile_regions(model, **self.state.dynamo_plugin.to_kwargs())
             else:
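Dropping `and not self.is_fsdp2` means FSDP2 models are no longer excluded from `torch.compile` in `prepare_model`. Below is a usage sketch under the assumption that the installed Accelerate exposes `fsdp_version=2` on `FullyShardedDataParallelPlugin` and accepts a `dynamo_plugin` keyword on `Accelerator` (older releases configure dynamo through `accelerate config` instead), so treat the exact keyword arguments as illustrative; launch with `accelerate launch` on a multi-GPU node.

import torch
from accelerate import Accelerator
from accelerate.utils import FullyShardedDataParallelPlugin, TorchDynamoPlugin

# Illustrative configuration: FSDP2 sharding combined with (regional) torch.compile.
accelerator = Accelerator(
    fsdp_plugin=FullyShardedDataParallelPlugin(fsdp_version=2),
    dynamo_plugin=TorchDynamoPlugin(backend="inductor", use_regional_compilation=True),
)

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 2))
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)  # created before prepare()

# With this change, prepare() compiles the model even on the FSDP2 path and then
# fixes up the optimizer's parameter references.
model, optimizer = accelerator.prepare(model, optimizer)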
@@ -3574,6 +3571,7 @@ def clear(self, *objects):
 
     def _get_named_parameters(self, *args, drop_refs=False):
         named_parameters = {}
+        accessor_mapping = {}
         for obj in args:
             if isinstance(obj, torch.nn.Module):
                 obj = extract_model_from_parallel(obj)
@@ -3583,9 +3581,7 @@ def _get_named_parameters(self, *args, drop_refs=False):
                 if self.fp8_backend == "AO":
                     from torchao.float8.fsdp_utils import WeightWithDynamicFloat8CastTensor
 
-                    accessor_mapping = {
-                        WeightWithDynamicFloat8CastTensor: "_tensor",
-                    }
+                    accessor_mapping[WeightWithDynamicFloat8CastTensor] = "_tensor"
 
                 named_parameters.update(
                     {
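The `accessor_mapping` now initialized at the top of `_get_named_parameters` and filled in the AO branch maps a wrapper tensor subclass to the attribute that holds the plain tensor (torchao's `WeightWithDynamicFloat8CastTensor` keeps it in `_tensor`). Here is a minimal sketch of how such a mapping can be applied when collecting parameters by storage pointer; `resolve_param` and `named_data_ptrs` are hypothetical helpers, not methods of this class.

import torch

def resolve_param(param, accessor_mapping):
    # If the parameter is a known wrapper subclass, follow the configured
    # attribute to the underlying tensor; otherwise use the parameter itself.
    for wrapper_type, attr in accessor_mapping.items():
        if isinstance(param, wrapper_type):
            return getattr(param, attr)
    return param

def named_data_ptrs(module, accessor_mapping):
    # Map parameter names to storage pointers, e.g. for matching old and new
    # parameters without holding references to the tensors themselves.
    return {
        name: resolve_param(p, accessor_mapping).data_ptr()
        for name, p in module.named_parameters()
    }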