Commit 71271d1

feat: support fsdpv2 and fsdpv2 + tp
Signed-off-by: Mehant Kammakomati <[email protected]>
1 parent cb7696f commit 71271d1

4 files changed, +76 -94 lines changed

src/accelerate/accelerator.py

Lines changed: 27 additions & 22 deletions
@@ -405,10 +405,12 @@ def __init__(
             if not isinstance(torch_tp_plugin, TorchTensorParallelPlugin):
                 raise TypeError("`torch_tp_plugin` must be a TorchTensorParallelPlugin object.")
             os.environ["ACCELERATE_USE_TP"] = "true"
-
+
         if fsdp2_plugin is None:
             fsdp2_plugin = (
-                FullyShardedDataParallelPlugin2() if os.environ.get("ACCELERATE_USE_FSDP2", "false") == "true" else None
+                FullyShardedDataParallelPlugin2()
+                if os.environ.get("ACCELERATE_USE_FSDP2", "false") == "true"
+                else None
             )
         else:
             if not isinstance(fsdp2_plugin, FullyShardedDataParallelPlugin2):
@@ -1513,7 +1515,15 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         elif device_placement and not self.verify_device_map(model):
             model = model.to(self.device)
         if not evaluation_mode:
-            device_mesh = prepare_nd_device_mesh(self.state.torch_tp_plugin.tp_size if self.state.torch_tp_plugin is not None else 1, self.state.fsdp2_plugin is not None)
+            # motivation behind preparing device mesh at the start is to easily extend
+            # device preparation for any combination of parallelisms and pass it on
+            # neatly to respective parallelism distribution code snippets.
+            # function prepare_nd_device_mesh should be enough to extend logic for future combinations
+            # for now prepare_nd_device_mesh handles any combination of TP and FSDP/HSDP
+            device_mesh = prepare_nd_device_mesh(
+                self.state.torch_tp_plugin.tp_size if self.state.torch_tp_plugin is not None else 1,
+                self.state.fsdp2_plugin is not None,
+            )
             if self.distributed_type in (
                 DistributedType.MULTI_GPU,
                 DistributedType.MULTI_MLU,
@@ -1548,6 +1558,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if self.distributed_type == DistributedType.FSDP2 or self.distributed_type == DistributedType.FSDP2_TP:
                 self.state.fsdp2_plugin.torch_device_mesh = device_mesh["dp", "fsdp"]
                 from torch.distributed._composable.fsdp import fully_shard, FSDPModule
+
                 # Check if the model is already a FSDP model due to `Manual Wrapping` and if so,
                 # don't wrap it again
                 # In case the model is already compiled using PyTorch 2.0 and the wrapped model in it
@@ -1558,7 +1569,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                 )

                 if not is_type_fsdp:
-                    fsdp2_kwargs = {
+                    fsdp2_kwargs = {
                         "mp_policy": self.state.fsdp2_plugin.mp_policy,
                         "reshard_after_forward": self.state.fsdp2_plugin.reshard_after_forward,
                         "offload_policy": self.state.fsdp2_plugin.offload_policy,
@@ -1570,25 +1581,14 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                     for layer in model.model.layers:
                         fully_shard(layer, **fsdp2_kwargs)
                     fully_shard(model, **fsdp2_kwargs)
+                    # if the previous and current models are same, delete the previous one
+                    if len(self._models) > 1 and (self._models[-2] is self._models[-1]):
+                        del self._models[-2]
+                    self._models[-1] = model

                 #######
-                # does existing activation_checkpointing API work out of the box with FSDP2?
+                # TODO: support activation_checkpointing for FSDP2 and nd parallel cases
                 #######
-                # if fsdp_plugin.activation_checkpointing:
-                #     from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-                #         CheckpointImpl,
-                #         apply_activation_checkpointing,
-                #         checkpoint_wrapper,
-                #     )
-
-                #     apply_activation_checkpointing(
-                #         model,
-                #         checkpoint_wrapper_fn=functools.partial(
-                #             checkpoint_wrapper,
-                #             checkpoint_impl=CheckpointImpl.NO_REENTRANT,
-                #         ),
-                #         auto_wrap_policy=fsdp_plugin.auto_wrap_policy,
-                #     )

             elif self.distributed_type == DistributedType.FSDP:
                 # We need to fix the optimizer *before* sharding the model
@@ -2227,7 +2227,10 @@ def prepare_data_loader(
         if device_placement is None:
             device_placement = self.device_placement if self.distributed_type != DistributedType.XLA else False

-        nd_device_mesh = prepare_nd_device_mesh(self.state.torch_tp_plugin.tp_size if self.state.torch_tp_plugin is not None else 1, self.state.fsdp2_plugin is not None)
+        nd_device_mesh = prepare_nd_device_mesh(
+            self.state.torch_tp_plugin.tp_size if self.state.torch_tp_plugin is not None else 1,
+            self.state.fsdp2_plugin is not None,
+        )
         prepared_data_loader = prepare_data_loader(
             data_loader,
             self.device,
@@ -2497,7 +2500,9 @@ def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
             parameters = [p for p in parameters]
             for model in self._models:
                 if parameters == [p for p in model.parameters()]:
-                    return torch.nn.utils.clip_grad_norm_(parameters=parameters, max_norm=max_norm, norm_type=norm_type)
+                    return torch.nn.utils.clip_grad_norm_(
+                        parameters=parameters, max_norm=max_norm, norm_type=norm_type
+                    )
         elif self.distributed_type == DistributedType.DEEPSPEED:
             # `accelerator.backward(loss)` is doing that automatically. Therefore, its implementation is not needed
             # We cannot return the gradient norm because DeepSpeed does it.
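
Taken together, the prepare_model changes boil down to: build an n-d device mesh up front, hand the ("dp", "fsdp") submesh to the FSDP2 plugin, then apply fully_shard block by block before wrapping the root module. Below is a minimal standalone sketch of that flow under stated assumptions: torch.distributed is already initialized (for example via torchrun), the model exposes model.model.layers as the hunk expects, and the mesh is passed to fully_shard explicitly here purely for illustration (the commit stores it on fsdp2_plugin instead).

# Hedged sketch of the FSDP2 wrapping flow above, not the Accelerator API itself.
# Assumes `torchrun --nproc-per-node=N` has already initialized the process group.
import torch
import torch.nn as nn
from torch.distributed._composable.fsdp import MixedPrecisionPolicy, fully_shard

from accelerate.utils.pytorch_utils import prepare_nd_device_mesh  # helper touched by this commit


class ToyInner(nn.Module):
    def __init__(self):
        super().__init__()
        # mimic the `model.model.layers` layout the hunk iterates over
        self.layers = nn.ModuleList(nn.Linear(64, 64) for _ in range(4))


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = ToyInner()


model = ToyModel().cuda()  # per-rank device selection elided for brevity
device_mesh = prepare_nd_device_mesh(tp_size=1, use_fsdp=True)  # ("dp", "fsdp") mesh

fsdp2_kwargs = {
    "mesh": device_mesh["dp", "fsdp"],  # illustration only; the commit stores this on fsdp2_plugin
    "reshard_after_forward": True,
    "mp_policy": MixedPrecisionPolicy(param_dtype=torch.bfloat16),
}

# shard each transformer block first, then the root module, as in the diff
for layer in model.model.layers:
    fully_shard(layer, **fsdp2_kwargs)
fully_shard(model, **fsdp2_kwargs)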

src/accelerate/commands/launch.py

Lines changed: 7 additions & 1 deletion
@@ -612,7 +612,9 @@ def launch_command_parser(subparsers=None):
     )

     # fsdp2 args
-    fsdp2_args = parser.add_argument_group("FSDP2 Arguments", "Arguments related to Fully Shared Data Parallelism Version 2.")
+    fsdp2_args = parser.add_argument_group(
+        "FSDP2 Arguments", "Arguments related to Fully Shared Data Parallelism Version 2."
+    )
     fsdp2_args.add_argument(
         "--fsdp2_reshard_after_forward",
         default="true",
@@ -1059,6 +1061,7 @@ def _validate_launch_command(args):
             and not args.tpu_use_cluster
             and not args.use_deepspeed
             and not args.use_fsdp
+            and not args.use_fsdp2
             and not args.use_tp
             and not args.use_megatron_lm
         ):
@@ -1078,6 +1081,7 @@ def _validate_launch_command(args):
             args.tpu = defaults.distributed_type == DistributedType.XLA
             args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
             args.use_tp = defaults.distributed_type == DistributedType.TP
+            args.use_fsdp2 = defaults.distributed_type == DistributedType.FSDP2
             args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM
             args.tpu_use_cluster = defaults.tpu_use_cluster if args.tpu else False
         if args.gpu_ids is None:
@@ -1237,6 +1241,8 @@ def launch_command(args):
         deepspeed_launcher(args)
     elif args.use_fsdp and not args.cpu:
         multi_gpu_launcher(args)
+    elif args.use_fsdp2 and not args.cpu:
+        multi_gpu_launcher(args)
    elif args.use_tp and not args.cpu:
        multi_gpu_launcher(args)
    elif args.use_megatron_lm and not args.cpu:
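
The launcher changes mirror the existing FSDP path: a run flagged for FSDP2, or a config whose distributed_type resolves to DistributedType.FSDP2, passes validation and is dispatched to multi_gpu_launcher. A quick, hedged way to sanity-check the new CLI surface is to build the parser directly; note that the --use_fsdp2 flag itself is not defined in these hunks, so its spelling is assumed here to follow --use_fsdp.

# Hedged sketch: inspect the parser built by launch_command_parser to confirm the
# FSDP2 argument group and routing flags behave as the hunks above suggest.
# Flag spellings are assumptions, not a documented interface.
from accelerate.commands.launch import launch_command_parser

parser = launch_command_parser()
args = parser.parse_args(
    [
        "--use_fsdp2",                       # assumed flag; routed to multi_gpu_launcher (hunk at 1244)
        "--fsdp2_reshard_after_forward", "true",
        "--num_processes", "8",
        "train.py",                          # hypothetical training script
    ]
)
print(args.use_fsdp2, args.fsdp2_reshard_after_forward)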

src/accelerate/utils/dataclasses.py

Lines changed: 32 additions & 69 deletions
@@ -1850,13 +1850,12 @@ class FullyShardedDataParallelPlugin2:
     reshard_after_forward: Optional[bool] = field(
         default=None,
         metadata={
-            "help":
-            "If reshard_after_forward is True, the parameters are sharded on every forward pass and all-gathered during backward pass."
-            "reshard_after_forward in conjunction with device mesh dimension would mean different strategies like the following:"
-            "reshard_after_forward=True and 1D device mesh mean full shard"
-            "reshard_after_forward=True and 2D device mesh mean hybrid shard"
-            "reshard_after_forward=False and 1D device mesh mean shard grad and optimizer states only"
-            "reshard_after_forward=False and 2D device mesh mean hybrid shard with grad and optim sharding only"
+            "help": "If reshard_after_forward is True, the parameters are sharded on every forward pass and all-gathered during backward pass."
+            "reshard_after_forward in conjunction with device mesh dimension would mean different strategies like the following:"
+            "reshard_after_forward=True and 1D device mesh mean full shard"
+            "reshard_after_forward=True and 2D device mesh mean hybrid shard"
+            "reshard_after_forward=False and 1D device mesh mean shard grad and optimizer states only"
+            "reshard_after_forward=False and 2D device mesh mean hybrid shard with grad and optim sharding only"
         },
     )
     offload_policy: Optional[Union[dict, "torch.distributed._composable.OffloadPolicy"]] = field(
@@ -1865,45 +1864,45 @@ class FullyShardedDataParallelPlugin2:
             "help": "A config to enable CPU offload. If passing in a `dict`, it should have the following key: `pin_memory`."
         },
     )
-    mp_policy: Optional[Union[dict, "torch.distributed._composable.MixedPrecisionPolicy"]] = (
-        field(
-            default=None,
-            metadata={
-                "help": "A config to enable mixed precision training with FullyShardedDataParallelv2. If passing in a `dict`, it"
+    mp_policy: Optional[Union[dict, "torch.distributed._composable.MixedPrecisionPolicy"]] = field(
+        default=None,
+        metadata={
+            "help": "A config to enable mixed precision training with FullyShardedDataParallelv2. If passing in a `dict`, it"
             "should have the following keys: `param_dtype`, `reduce_dtype`, `output_dtype`, and `cast_forward_inputs`. "
-            },
-        )
+        },
     )
     ignored_params: Optional[set["torch.nn.Parameter"]] = field(
         default=None,
-        metadata={
-            "help": "The set of parameters that we don't want to shard with FSDP."
-        },
+        metadata={"help": "The set of parameters that we don't want to shard with FSDP."},
     )

     def __post_init__(self):
         env_prefix = "FSDP2_"
         if self.reshard_after_forward is None:
             self.reshard_after_forward = str_to_bool(os.environ.get(env_prefix + "RESHARD_AFTER_FORWARD", "True")) == 1
-
+
         self.set_offload_policy()
         self.set_mp_policy()

         from torch.distributed.device_mesh import init_device_mesh
+
         dp_mesh_dim_name = "dp"
         fsdp_mesh_dim_name = "fsdp"
         device = "cuda"  # support for other devices has to be investigated
         num_nodes = torch.distributed.get_world_size() // torch.cuda.device_count()
         nproc_per_node = torch.cuda.device_count()
-        self.torch_device_mesh = init_device_mesh(device, (num_nodes,nproc_per_node), mesh_dim_names=(dp_mesh_dim_name, fsdp_mesh_dim_name))
+        self.torch_device_mesh = init_device_mesh(
+            device, (num_nodes, nproc_per_node), mesh_dim_names=(dp_mesh_dim_name, fsdp_mesh_dim_name)
+        )

     def set_offload_policy(self, pin_memory=None):
         """
         Set the offload policy
         """
         from torch.distributed._composable.fsdp import CPUOffloadPolicy
+
         env_prefix = "FSDP2_"
-
+
         if self.offload_policy is None:
             fsdp2_cpu_offload = str_to_bool(os.environ.get(env_prefix + "CPU_OFFLOAD", "False")) == 1
             if fsdp2_cpu_offload:
@@ -1918,6 +1917,7 @@ def set_mp_policy(self, param_dtype=None, reduce_dtype=None, output_dtype=None,
         Set mixed precision policy
         """
         from torch.distributed._composable.fsdp import MixedPrecisionPolicy
+
         env_prefix = "FSDP2_"
         mixed_precision_mapping = {
             "fp8": torch.bfloat16,
@@ -1926,18 +1926,22 @@ def set_mp_policy(self, param_dtype=None, reduce_dtype=None, output_dtype=None,
             "fp32": torch.float32,
         }

-        # current_env["FSDP2_MP_PARAM_DTYPE"] = str(args.fsdp2_mp_param_dtype).lower()
-        # current_env["FSDP2_MP_REDUCE_DTYPE"] = str(args.fsdp2_mp_reduce_dtype).lower()
-        # current_env["FSDP2_MP_OUTPUT_DTYPE"] = str(args.fsdp2_mp_output_dtype).lower()
-        # current_env["FSDP2_CAST_FORWARD_INPUTS"] = str(args.fsdp2_cast_forward_inputs).lower()
-
         if self.mp_policy is None:
             param_dtype = mixed_precision_mapping.get(os.environ.get(env_prefix + "MP_PARAM_DTYPE", param_dtype), None)
-            reduce_dtype = mixed_precision_mapping.get(os.environ.get(env_prefix + "MP_REDUCE_DTYPE", reduce_dtype), None)
-            output_dtype = mixed_precision_mapping.get(os.environ.get(env_prefix + "MP_OUTPUT_DTYPE", output_dtype), None)
+            reduce_dtype = mixed_precision_mapping.get(
+                os.environ.get(env_prefix + "MP_REDUCE_DTYPE", reduce_dtype), None
+            )
+            output_dtype = mixed_precision_mapping.get(
+                os.environ.get(env_prefix + "MP_OUTPUT_DTYPE", output_dtype), None
+            )
             if not cast_forward_inputs:
                 cast_forward_inputs = str_to_bool(os.environ.get(env_prefix + "CAST_FORWARD_INPUTS", "False")) == 1
-            self.mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype, output_dtype=output_dtype, cast_forward_inputs=cast_forward_inputs)
+            self.mp_policy = MixedPrecisionPolicy(
+                param_dtype=param_dtype,
+                reduce_dtype=reduce_dtype,
+                output_dtype=output_dtype,
+                cast_forward_inputs=cast_forward_inputs,
+            )

         if isinstance(self.mp_policy, dict):
             self.mp_policy = MixedPrecisionPolicy(**self.mp_policy)
@@ -1973,47 +1977,6 @@ def __post_init__(self):
         self.torch_device_mesh = init_device_mesh(device, (self.tp_size,), mesh_dim_names=(mesh_dim_name,))


-@dataclass
-class DeviceMeshHandler:
-    """
-    This handler is used to create and hold device mesh state throughout the training
-    and dynamically support any combination of parallelisms.
-    """
-
-    tp_size: int = field(
-        default=1,
-        metadata={"help": "tensor parallel size will be used in the device mesh preparation with other parallelisms."},
-    )
-
-    use_fsdp: bool = field(
-        default=False,
-        metadata={"help": "fsdp v2 will be used with other parallelisms for device mesh preparation."},
-    )
-
-    torch_device_mesh: Optional["torch.distributed.DeviceMesh"] = field(default=None)
-
-    def __post_init__(self):
-        self.tp_size = self.tp_size if os.environ.get("TP_SIZE", "1") == "1" else int(os.environ.get("TP_SIZE", "1"))
-        self.use_fsdp = self.use_fsdp or str_to_bool(os.environ.get("ACCELERATE_USE_FSDP2", "False")) == 1
-        if self.tp_size == 1:
-            raise ValueError("Provide TP degree > 1.")
-
-        if is_torch_version("<", BETA_TP_AVAILABLE_PYTORCH_VERSION):
-            raise ValueError(
-                f"Minimum PyTorch version {BETA_TP_AVAILABLE_PYTORCH_VERSION} needed to use tensor parallel."
-            )
-        from torch.distributed.device_mesh import init_device_mesh
-
-        mesh_dim_names = ("tp",)
-        mesh_dims = (self.tp_size,)
-        if self.use_fsdp:
-            num_nodes = torch.distributed.get_world_size() // torch.cuda.device_count()
-            nproc_per_node = torch.cuda.device_count()
-            mesh_dim_names = ("dp", "fsdp",) + mesh_dim_names
-            mesh_dims = (num_nodes, nproc_per_node, ) + mesh_dims
-        device = "cuda"  # support for other devices has to be investigated
-        self.torch_device_mesh = init_device_mesh(device, mesh_dims, mesh_dim_names=mesh_dim_names)
-
 @dataclass
 class MegatronLMPlugin:
     """

src/accelerate/utils/pytorch_utils.py

Lines changed: 10 additions & 2 deletions
@@ -1,10 +1,12 @@
 import torch

+
 def prepare_nd_device_mesh(tp_size=1, use_fsdp=False):
     """Returns a multi dimensional device mesh.
     Extend this function to support various combinations of parallelisms.
     """
     from torch.distributed.device_mesh import init_device_mesh
+
     mesh_dim_names = ()
     mesh_dims = ()
     if tp_size <= 1 and not use_fsdp:
@@ -15,7 +17,13 @@ def prepare_nd_device_mesh(tp_size=1, use_fsdp=False):
     if use_fsdp:
         num_nodes = torch.distributed.get_world_size() // torch.cuda.device_count()
         nproc_per_node = torch.cuda.device_count()
-        mesh_dim_names = ("dp", "fsdp",) + mesh_dim_names
-        mesh_dims = (num_nodes, nproc_per_node, ) + mesh_dims
+        mesh_dim_names = (
+            "dp",
+            "fsdp",
+        ) + mesh_dim_names
+        mesh_dims = (
+            num_nodes,
+            nproc_per_node // tp_size,
+        ) + mesh_dims
     device = "cuda"
     return init_device_mesh(device, mesh_dims, mesh_dim_names=mesh_dim_names)
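
The functional change in this file is that the FSDP mesh dimension is divided by the TP degree, so the product of the mesh dimensions stays equal to the world size when TP and FSDP2 are combined. The arithmetic-only sketch below (no process group required) shows the resulting layout for a hypothetical 2-node, 8-GPU job; the TP branch of prepare_nd_device_mesh is not shown in this hunk, so it is assumed here to follow the removed DeviceMeshHandler logic.

# Hedged back-of-the-envelope check of the mesh shape prepare_nd_device_mesh would
# build for a 2-node x 8-GPU job with TP degree 2 and FSDP2 enabled. The arithmetic
# mirrors the hunk above; the "tp" branch is an assumption based on the removed
# DeviceMeshHandler code.
world_size, gpus_per_node, tp_size, use_fsdp = 16, 8, 2, True

mesh_dim_names, mesh_dims = (), ()
if tp_size > 1:  # assumed TP branch
    mesh_dim_names, mesh_dims = ("tp",), (tp_size,)
if use_fsdp:
    num_nodes = world_size // gpus_per_node
    mesh_dim_names = ("dp", "fsdp") + mesh_dim_names
    mesh_dims = (num_nodes, gpus_per_node // tp_size) + mesh_dims

print(dict(zip(mesh_dim_names, mesh_dims)))  # -> {'dp': 2, 'fsdp': 4, 'tp': 2}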
