
Commit 4309419

1M+ context length (context parallel integration) (#2668)
1 parent 86f148b commit 4309419

5 files changed: +215 -32 lines

recipes/full_finetune_distributed.py

Lines changed: 24 additions & 11 deletions
@@ -158,6 +158,7 @@ def __init__(self, cfg: DictConfig) -> None:
             raise ValueError(
                 "Tensor Parallel plan needs to be provided when tensor parallel is enabled."
             )
+        self.cp_degree = cfg.get("context_parallel_dim", 1)
         data_shard = cfg.get("data_parallel_shard_dim", -1)  # -1 means to infer
         data_replicate = cfg.get("data_parallel_replicate_dim", 1)

@@ -166,6 +167,7 @@ def __init__(self, cfg: DictConfig) -> None:
             dp_replicate=data_replicate,
             dp_shard=data_shard,
             tp=self.tp_degree,
+            cp=self.cp_degree,
             world_size=self.world_size,
         )
         self.world_mesh = self.parallel_dims.build_mesh(device_type=device_type)
@@ -603,6 +605,10 @@ def _setup_model(
                     "FP8 training does not support tensor parallelism yet. "
                     "This will be enabled in the near future."
                 )
+            if self.cp_degree > 1:
+                raise ValueError(
+                    "Context Parallel for fp8 training is not currently supported"
+                )
             model = convert_to_float8_training(model, self._fp8_recipe_name)

         # Apply tensor parallelism to the model
@@ -665,6 +671,13 @@ def _setup_model(
                 dp_mesh=self.world_mesh[dp_mesh_dim_names],
             )

+        # Define context manager for context parallelism
+        self.context_parallel_manager = training.get_context_parallel_manager(
+            enabled=self.cp_degree > 1,
+            world_mesh=self.world_mesh,
+            model=model,
+        )
+
         with training.set_default_dtype(self._dtype), self._device:
             for m in model.modules():
                 # RoPE is not covered in state dict
@@ -797,7 +810,7 @@ def _setup_data(
                 collate_fn,
                 padding_idx=self._tokenizer.pad_id,
                 ignore_idx=self._loss_fn.ignore_index,
-                pad_to_multiple_of=self.tp_degree,
+                pad_to_multiple_of=self.parallel_dims.min_seq_len_divisor,
             )
             if not packed
             else padded_collate_packed
@@ -920,17 +933,17 @@ def train(self) -> None:

                 # Loss is normalized by default so we multiply by the number of tokens
                 # This way we can normalize by the total number of tokens if we're accumulating gradients
-                current_loss = self._loss_step(batch) * current_num_tokens
-                running_loss += current_loss
-
-                # For optimizer in backward, we need to normalize before calling backward
-                # This case and gradient accumulation are mutually exclusive
-                if self._optimizer_in_bwd:
-                    torch.distributed.all_reduce(num_tokens)
-                    torch.distributed.all_reduce(running_loss)
-                    current_loss = current_loss * (self.dp_degree / num_tokens)
+                with self.context_parallel_manager(list(batch.values())):
+                    current_loss = self._loss_step(batch) * current_num_tokens
+                    running_loss += current_loss
+                    # For optimizer in backward, we need to normalize before calling backward
+                    # This case and gradient accumulation are mutually exclusive
+                    if self._optimizer_in_bwd:
+                        torch.distributed.all_reduce(num_tokens)
+                        torch.distributed.all_reduce(running_loss)
+                        current_loss = current_loss * (self.dp_degree / num_tokens)
+                    current_loss.backward()

-                current_loss.backward()
                 # Optimizer step (if not fused in backward call)
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
                     if not self._optimizer_in_bwd:
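
The recipe-side integration boils down to three steps: read a new `context_parallel_dim` config knob (defaulting to 1, i.e. disabled), pass it to `ParallelDims` as `cp` so the device mesh gains a context-parallel axis, and run the loss and backward of every step inside the manager returned by `training.get_context_parallel_manager`. The sketch below condenses those call sites into two standalone helpers; it is illustrative only and not part of the commit, and it assumes the caller already has the recipe's `world_mesh`, `model`, batch, and loss-step function in hand.

import torch
from omegaconf import DictConfig
from torchtune import training


def build_cp_manager(cfg: DictConfig, world_mesh, model: torch.nn.Module):
    # Mirrors the _setup_model change: the manager is a no-op when
    # context_parallel_dim is left at its default of 1.
    cp_degree = cfg.get("context_parallel_dim", 1)
    return training.get_context_parallel_manager(
        enabled=cp_degree > 1,
        world_mesh=world_mesh,  # mesh built by ParallelDims with cp=cp_degree
        model=model,
    )


def cp_loss_step(context_parallel_manager, loss_step, batch, current_num_tokens):
    # Mirrors the train() change: every tensor in the batch is handed to the
    # manager so it can be sharded along the sequence dimension, and the
    # forward, loss, and backward all run inside the context.
    with context_parallel_manager(list(batch.values())):
        current_loss = loss_step(batch) * current_num_tokens
        current_loss.backward()
    return current_loss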

recipes/qat_distributed.py

Lines changed: 21 additions & 10 deletions
@@ -168,6 +168,7 @@ def __init__(self, cfg: DictConfig) -> None:
             raise ValueError(
                 "Tensor Parallel plan needs to be provided when tensor parallel is enabled."
             )
+        self.cp_degree = cfg.get("context_parallel_dim", 1)
         data_shard = cfg.get("data_parallel_shard_dim", -1)  # -1 means to infer
         data_replicate = cfg.get("data_parallel_replicate_dim", 1)

@@ -176,6 +177,7 @@ def __init__(self, cfg: DictConfig) -> None:
             dp_replicate=data_replicate,
             dp_shard=data_shard,
             tp=self.tp_degree,
+            cp=self.cp_degree,
             world_size=self.world_size,
         )
         self.world_mesh = self.parallel_dims.build_mesh(device_type=device_type)
@@ -670,6 +672,13 @@ def _setup_model(
                 dp_mesh=self.world_mesh[dp_mesh_dim_names],
             )

+        # Define context manager for context parallelism
+        self.context_parallel_manager = training.get_context_parallel_manager(
+            enabled=self.cp_degree > 1,
+            world_mesh=self.world_mesh,
+            model=model,
+        )
+
         with training.set_default_dtype(self._dtype), self._device:
             for m in model.modules():
                 # RoPE is not covered in state dict
@@ -802,7 +811,7 @@ def _setup_data(
                 collate_fn,
                 padding_idx=self._tokenizer.pad_id,
                 ignore_idx=self._loss_fn.ignore_index,
-                pad_to_multiple_of=self.tp_degree,
+                pad_to_multiple_of=self.parallel_dims.min_seq_len_divisor,
             )
             if not packed
             else padded_collate_packed
@@ -925,17 +934,19 @@ def train(self) -> None:

                 # Loss is normalized by default so we multiply by the number of tokens
                 # This way we can normalize by the total number of tokens if we're accumulating gradients
-                current_loss = self._loss_step(batch) * current_num_tokens
-                running_loss += current_loss
+                with self.context_parallel_manager(list(batch.values())):
+                    current_loss = self._loss_step(batch) * current_num_tokens
+                    running_loss += current_loss
+
+                    # For optimizer in backward, we need to normalize before calling backward
+                    # This case and gradient accumulation are mutually exclusive
+                    if self._optimizer_in_bwd:
+                        torch.distributed.all_reduce(num_tokens)
+                        torch.distributed.all_reduce(running_loss)
+                        current_loss = current_loss * (self.dp_degree / num_tokens)

-                # For optimizer in backward, we need to normalize before calling backward
-                # This case and gradient accumulation are mutually exclusive
-                if self._optimizer_in_bwd:
-                    torch.distributed.all_reduce(num_tokens)
-                    torch.distributed.all_reduce(running_loss)
-                    current_loss = current_loss * (self.dp_degree / num_tokens)
+                    current_loss.backward()

-                current_loss.backward()
                 # Optimizer step (if not fused in backward call)
                 if (idx + 1) % self._gradient_accumulation_steps == 0:
                     if not self._optimizer_in_bwd:
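
The QAT recipe picks up exactly the same wiring as the full-finetune recipe, so enabling the feature is purely a config change. The snippet below is a hypothetical illustration of how the parallelism degrees are expected to relate to the world size; the `tensor_parallel_dim` key and the product-equals-world-size check are assumptions about the surrounding recipe and `ParallelDims` validation, not something shown in this diff.

import math

from omegaconf import OmegaConf

# Hypothetical parallelism section of a recipe config; only context_parallel_dim
# and the two data_parallel_* keys appear in this diff, tensor_parallel_dim is assumed.
cfg = OmegaConf.create(
    {
        "data_parallel_replicate_dim": 1,
        "data_parallel_shard_dim": 4,  # -1 would mean "infer from world size"
        "tensor_parallel_dim": 2,
        "context_parallel_dim": 4,
    }
)

world_size = 32  # e.g. 4 nodes x 8 GPUs
degrees = [
    cfg.data_parallel_replicate_dim,
    cfg.data_parallel_shard_dim,
    cfg.tensor_parallel_dim,
    cfg.context_parallel_dim,
]
# Assumption: ParallelDims validates that the degrees factor the world size.
assert math.prod(degrees) == world_size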

torchtune/models/llama3_2/_model_builders.py

Lines changed: 12 additions & 3 deletions
@@ -15,12 +15,14 @@
 the llama3_2_1b model builder uses the llama3_2 component builder to create the
 Llama3.2 1B model.
 """
+
+
 def llama3_2_1b(
     tie_word_embeddings: bool = True,
 ) -> TransformerDecoder:
     """
     Builder for creating a Llama3.2 model initialized w/ the default 1b parameter values.
-
+
     Args:
         tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied.

@@ -41,6 +43,8 @@ def llama3_2_1b(
         scale_factor=32,
         tie_word_embeddings=tie_word_embeddings,
     )
+
+
 def llama3_2_3b(
     tie_word_embeddings: bool = True,
 ) -> TransformerDecoder:
@@ -67,6 +71,8 @@ def llama3_2_3b(
         scale_factor=32,
         tie_word_embeddings=tie_word_embeddings,
     )
+
+
 def lora_llama3_2_1b(
     lora_attn_modules: list[LORA_ATTN_MODULES],
     apply_lora_to_mlp: bool = False,
@@ -83,7 +89,7 @@ def lora_llama3_2_1b(
     The Llama3.2 defaults are the same as in :func:`~torchtune.models.llama3_2.llama3_2_1b`,
     while LoRA default params are based on
     https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43.
-
+
     Args:
         lora_attn_modules (list[LORA_ATTN_MODULES]): list of which linear layers
             LoRA should be applied to in each self-attention block. Options are
@@ -125,6 +131,8 @@ def lora_llama3_2_1b(
         quantize_base=quantize_base,
         tie_word_embeddings=tie_word_embeddings,
     )
+
+
 def lora_llama3_2_3b(
     lora_attn_modules: list[LORA_ATTN_MODULES],
     apply_lora_to_mlp: bool = False,
@@ -161,7 +169,6 @@ def lora_llama3_2_3b(
     Returns:
         TransformerDecoder: Instantiation of Llama3.2 3B model with LoRA applied
     """
-
     return lora_llama3_2(
         lora_attn_modules=lora_attn_modules,
         apply_lora_to_mlp=apply_lora_to_mlp,
@@ -184,6 +191,8 @@ def lora_llama3_2_3b(
         quantize_base=quantize_base,
         tie_word_embeddings=tie_word_embeddings,
     )
+
+
 qlora_llama3_2_1b = partial(lora_llama3_2_1b, quantize_base=True)
 qlora_llama3_2_1b.__doc__ = """
 Builder for creating a Llama3.2 1B model with QLoRA enabled. Base model weights in linear layers

torchtune/training/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,7 @@
 from torchtune.training._compile import compile_loss, compile_model
 from torchtune.training._distributed import (
     gather_cpu_state_dict,
+    get_context_parallel_manager,
     get_distributed_backend,
     get_full_optimizer_state_dict,
     get_shard_conditions,
@@ -145,4 +146,5 @@
     "get_distributed_backend",
     "disable_dropout",
     "DATALOADER_KEY",
+    "get_context_parallel_manager",
 ]
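
`get_context_parallel_manager` is now re-exported from `torchtune.training`. Its implementation lives in `torchtune/training/_distributed.py`, one of the five changed files that is not shown in this excerpt, so the sketch below is only a guess at the general shape such a helper can take on top of PyTorch's experimental `torch.distributed.tensor.experimental.context_parallel` API; everything beyond the keyword arguments visible at the recipe call sites is an assumption.

# Hypothetical sketch, NOT the commit's implementation.
import contextlib

import torch
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor.experimental import context_parallel


def get_context_parallel_manager(
    *, enabled: bool, world_mesh: DeviceMesh, model: torch.nn.Module
):
    # `model` is presumably used by the real helper (e.g. to include model-owned
    # buffers such as RoPE caches); it is ignored in this toy version.
    def manager(buffers: list[torch.Tensor]):
        if not enabled:
            return contextlib.nullcontext()
        # Shard each buffer along its sequence dimension across the "cp" sub-mesh;
        # dim 1 as the sequence dimension for all batch tensors is an assumption.
        return context_parallel(
            world_mesh["cp"],
            buffers=buffers,
            buffer_seq_dims=[1] * len(buffers),
            no_restore_buffers=set(buffers),
        )

    return manager

A helper shaped like this is what lets the recipes write `with self.context_parallel_manager(list(batch.values())):` and have it be a cheap no-op when `context_parallel_dim` is 1 and a sequence-sharding context otherwise.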
