
Commit 168042e (1 parent: 0991f97)

WIP-DEBUG-PROFILE torch.compile

ghstack-source-id: b8d6007
Pull Request resolved: #2644

File tree: 11 files changed (+507, -38 lines)


recipes/configs/llama4/scout_17B_16E_full.yaml

Lines changed: 20 additions & 7 deletions

@@ -18,7 +18,7 @@ output_dir: /tmp/torchtune/llama4_17Bx16E/full
 model:
   _component_: torchtune.models.llama4.llama4_scout_17b_16e

-tensor_parallel_dim: 2 # For multi-node training we recommend tensor_parallel_dim: 8
+tensor_parallel_dim: 1 # For multi-node training we recommend tensor_parallel_dim: 8
 tensor_parallel_plan:
   _component_: torchtune.models.llama4.decoder_only_tp_plan
 data_parallel_shard_dim: -1 # Will infer based on TP dim, effectively controls FSDP
@@ -73,11 +73,11 @@ fsdp_cpu_offload: True
 # compile False means no torch.compile
 # compile Dictionary with keys: "model", "loss", "optimizer_step"
 # enables torch.compile only for specified components.
-compile: False
-# model: True
-# loss: True
-# optimizer_step: False
-# scale_grads: True
+compile: True
+# model: True
+# loss: True
+# optimizer_step: True
+# scale_grads: True

 # Reduced precision
 dtype: bf16
@@ -93,4 +93,17 @@ log_level: INFO # DEBUG, WARN, etc.
 # Useful for understanding how to optimize memory and performance
 profiler:
   _component_: torchtune.training.setup_torch_profiler
-  enabled: False
+  enabled: True
+  output_dir: ${output_dir}/profiling_outputs
+  cpu: True
+  cuda: True
+  profile_memory: True
+  with_stack: True
+  record_shapes: True
+  with_flops: False
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 1
+  num_cycles: 1
+
+# enable_fp8_training: True
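
For reference, the profiler block enabled above corresponds roughly to the following raw torch.profiler setup. This is a minimal sketch assuming a generic training loop; the actual wiring in torchtune lives in torchtune.training.setup_torch_profiler, and the output path shown is just the interpolated ${output_dir} value.

import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

# cpu/cuda -> activities; wait/warmup/active_steps/num_cycles -> schedule
prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=5, warmup=3, active=1, repeat=1),
    profile_memory=True,
    with_stack=True,
    record_shapes=True,
    with_flops=False,
    on_trace_ready=tensorboard_trace_handler(
        "/tmp/torchtune/llama4_17Bx16E/full/profiling_outputs"
    ),
)

with prof:
    for _ in range(9):   # wait + warmup + active steps of one cycle
        # forward / backward / optimizer step would run here
        prof.step()      # advance the profiler schedule once per training step

With this schedule the profiler idles for 5 steps, warms up for 3, and records a single active step per cycle, which keeps trace size manageable even with memory and stack recording turned on.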

recipes/full_finetune_distributed.py

Lines changed: 32 additions & 4 deletions

@@ -16,6 +16,7 @@
 from omegaconf import DictConfig, ListConfig

 from torch import nn
+import torch.distributed as dist
 from torch.distributed import destroy_process_group, init_process_group
 from torch.distributed.tensor import DTensor
 from torch.distributed.tensor.parallel import parallelize_module
@@ -147,7 +148,10 @@ def __init__(self, cfg: DictConfig) -> None:
             offload_ops_to_cpu=self.fsdp_cpu_offload
             or self._enable_async_checkpointing,
         )
-        init_process_group(self.distributed_backend)
+        group_name = "torchtune-finetune"
+        init_process_group(self.distributed_backend, group_name=group_name)
+        pg = dist.distributed_c10d._get_default_group()
+        torch._C._distributed_c10d._register_process_group(group_name, pg)

         # Initialize distributed variables
         self.world_size, self.rank = utils.get_world_size_and_rank()
@@ -328,6 +332,8 @@ def setup(self, cfg: DictConfig) -> None:
         compile = cfg.get("compile")
         compile_bool = bool(compile)
         self._compile_backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
+        self._compile_mode = None  # "max-autotune-no-cudagraphs"
+        torch._inductor.config.cpp_wrapper = True

         self._compile_model = compile_bool
         self._compile_loss = compile_bool
@@ -343,7 +349,7 @@ def setup(self, cfg: DictConfig) -> None:
         self._grad_scaler = training.scale_grads_
         if self._compile_scale_grads:
             self._grad_scaler = torch.compile(
-                self._grad_scaler, backend=self._compile_backend
+                self._grad_scaler, backend=self._compile_backend, mode=self._compile_mode
             )

         self._model = self._setup_model(
@@ -380,6 +386,7 @@ def setup(self, cfg: DictConfig) -> None:
             self._optimizer.step = torch.compile(
                 self._optimizer.step,
                 backend=self._compile_backend,
+                mode=self._compile_mode
             )

         if self._resume_from_checkpoint:
@@ -413,7 +420,7 @@ def setup(self, cfg: DictConfig) -> None:
             self._loss_fn.set_model_output(self._model)

         if self._compile_loss:
-            training.compile_loss(self._loss_fn, verbose=self._is_rank_zero)
+            training.compile_loss(self._loss_fn, mode=self._compile_mode, verbose=self._is_rank_zero)

         utils.log_rank_zero(self._logger, "Loss is initialized.")

@@ -586,7 +593,7 @@ def _setup_model(
             model = config.instantiate(cfg_model)

         if self._compile_model:
-            training.compile_model(model, verbose=self._is_rank_zero)
+            training.compile_model(model, mode=self._compile_mode, verbose=self._is_rank_zero)

         if self._enable_fp8_training:
             # Requires https://github.com/pytorch/pytorch/pull/148922
@@ -1068,6 +1075,26 @@ def cleanup(self) -> None:
         self._metric_logger.close()
         destroy_process_group()

+# from torch.utils._python_dispatch import TorchDispatchMode
+# import torch.utils._pytree as pytree
+# from torch._higher_order_ops.flex_attention import flex_attention
+#
+#
+#
+# class Mode(TorchDispatchMode):
+#     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+#         r = torch.distributed.get_rank()
+#         print(f"XXX RANK[{r}] MODE._torch_dispatch_ {func} {types}")
+#         for a in pytree.tree_leaves(args):
+#             if issubclass(type(a), torch.Tensor):
+#                 print(f"XXX RANK[{r}] {a.dtype} {a.shape}")
+#             else:
+#                 print(f"XXX RANK[{r}] {a}")
+#         return func(*args, **kwargs)
+#
+# def flex_attention_mode_call(mode, *args, **kwargs):
+#     return flex_attention(*args, **kwargs)
+# flex_attention.py_impl(Mode)(flex_attention_mode_call)

 @config.parse
 def recipe_main(cfg: DictConfig) -> None:
@@ -1081,6 +1108,7 @@ def recipe_main(cfg: DictConfig) -> None:
     config.log_config(recipe_name="FullFinetuneRecipeDistributed", cfg=cfg)
     recipe = FullFinetuneRecipeDistributed(cfg=cfg)
     recipe.setup(cfg=cfg)
+    # with Mode():
     recipe.train()
     recipe.cleanup()
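
Two threads run through this file: every torch.compile call now also receives mode=self._compile_mode (left at None here, i.e. torch.compile's default mode, with "max-autotune-no-cudagraphs" noted as the alternative), and the default process group is registered under an explicit name. The snippet below is a standalone sketch of that registration pattern, not torchtune code; the single-process gloo world and the address are purely illustrative, and the two underscore-prefixed calls are the same private APIs used in the diff above.

import torch
import torch.distributed as dist

group_name = "torchtune-finetune"

# Illustrative single-process "world"; a real run would use the launcher's
# rendezvous (env://) and the NCCL backend on GPUs.
dist.init_process_group(
    "gloo",
    init_method="tcp://127.0.0.1:29501",
    rank=0,
    world_size=1,
    group_name=group_name,
)

# Fetch the default group and register it under the chosen name, so that
# code which looks process groups up by name (e.g. traced collectives)
# can resolve it later.
pg = dist.distributed_c10d._get_default_group()
torch._C._distributed_c10d._register_process_group(group_name, pg)

dist.destroy_process_group()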

torchtune/modules/attention_utils.py

Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ def compile_flex_attention():
     # when compiled. To insulate it from the compiler, we wrap it with
     # compiler.disable so that it can be used regardless of whether the model
     # is compiled or not, and flex attention always remains compiled.
-    @torch.compiler.disable(recursive=False)
+    # @torch.compiler.disable(recursive=False)
     def compile_friendly_flex_attention(
         q: torch.Tensor,
         k: torch.Tensor,
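
Commenting out the decorator lets Dynamo trace through compile_friendly_flex_attention instead of treating it as a graph break. For context, here is a toy sketch (not torchtune code) of what torch.compiler.disable(recursive=False) does when it is present:

import torch

def inner(x):
    return torch.sin(x) + 1.0

@torch.compiler.disable(recursive=False)
def outer(x):
    # This frame is skipped by Dynamo (a graph break), but `inner` can
    # still be compiled, because recursive=False does not propagate the
    # disable into callees.
    return inner(x) * 2.0

compiled = torch.compile(outer)
print(compiled(torch.randn(4)))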

torchtune/modules/moe/experts.py

Lines changed: 55 additions & 19 deletions

@@ -11,8 +11,13 @@
 from torch import nn
 from torch.nn import functional as F
 from torchtune.modules.peft import AdapterModule
+from torchtune.modules.moe.moe import USE_GROUPED_MM


+@torch._dynamo.allow_in_graph
+def _grouped_mm(x, w, offs):
+    return torch._grouped_mm(x, w, offs=offs)
+
 class GroupedExperts(nn.Module):
     """This class implements the grouped experts layer used in Mixture of Experts. Each expert
     is a variant of the Gated Linear Units network. See more details in https://arxiv.org/pdf/2002.05202.
@@ -50,6 +55,7 @@ def reset_parameters(self) -> None:
     # TODO: force no inference mode as a hack to get around
     # "Cannot set version_counter for inference tensor"
     @torch.inference_mode(mode=False)
+    # @torch._dynamo.disable(recursive=False)
     def forward(
         self,
         x: torch.Tensor,
@@ -64,27 +70,57 @@ def forward(
         Returns:
             torch.Tensor: tensor with shape ``(bsz * seq_len * experts_per_token, dim)``
         """
+        self.use_grouped_mm = USE_GROUPED_MM
+        if not self.use_grouped_mm:
+            # a tuple of tensors indexed by experts
+            # each with shape (tokens_per_expert(varying), dim)
+            x = torch.split(
+                x,
+                split_size_or_sections=num_tokens_per_expert.tolist(),
+                dim=0,
+            )
+            out_experts_splits = []
+            for expert_idx, x_expert in enumerate(x):
+                w1, w2, w3 = (
+                    self.gate_proj[expert_idx],
+                    self.down_proj[expert_idx],
+                    self.up_proj[expert_idx],
+                )
+                h = self.act_fn(torch.matmul(x_expert, w1))
+                h = h * torch.matmul(x_expert, w3)
+                h = torch.matmul(h, w2)
+                # h shape (tokens_per_expert(varying), dim)
+                out_experts_splits.append(h)
+            out = torch.cat(out_experts_splits, dim=0)

-        # a tuple of tensors indexed by experts
-        # each with shape (tokens_per_expert(varying), dim)
-        x = torch.split(
-            x,
-            split_size_or_sections=num_tokens_per_expert.tolist(),
-            dim=0,
-        )
-        out_experts_splits = []
-        for expert_idx, x_expert in enumerate(x):
-            w1, w2, w3 = (
-                self.gate_proj[expert_idx],
-                self.down_proj[expert_idx],
-                self.up_proj[expert_idx],
+            return out
+
+        # grouped mm implementation
+        if num_tokens_per_expert is not None:
+            # https://github.com/pytorch/pytorch/pull/150374
+            # NOTE: torch._grouped_mm requires bf16 dtypes
+            # and shapes to be multiple of 8
+            offsets = torch.cumsum(
+                num_tokens_per_expert, dim=0, dtype=torch.int32
             )
-            h = self.act_fn(torch.matmul(x_expert, w1))
-            h = h * torch.matmul(x_expert, w3)
-            h = torch.matmul(h, w2)
-            # h shape (tokens_per_expert(varying), dim)
-            out_experts_splits.append(h)
-        out = torch.cat(out_experts_splits, dim=0)
+            # grouped mm between a 2D tensor and a 3D tensor
+            assert x.dim() == 2
+        else:
+            offsets = None
+            # fall back to regular bmm between 3D tensors
+            assert x.dim() == 3
+
+        w1, w2, w3 = (
+            self.gate_proj,
+            self.down_proj,
+            self.up_proj,
+        )
+        assert (
+            x.dtype == w1.dtype == w2.dtype == w3.dtype == torch.bfloat16
+        ), "torch._grouped_mm only supports bf16 dtypes"
+        h = F.silu(_grouped_mm(x, w1, offs=offsets))
+        h = h * _grouped_mm(x, w3, offs=offsets)
+        out = _grouped_mm(h, w2, offs=offsets)

         return out
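
The @torch._dynamo.allow_in_graph wrapper tells Dynamo to record the _grouped_mm call as a single graph node rather than tracing into it. As for the contract behind torch._grouped_mm with an offs argument: offs holds the cumulative end index of each expert's token slice, and the weight tensor stacks one matrix per expert. Below is a loop-based reference of that contract, for illustration only (torch._grouped_mm itself additionally requires bf16 inputs and supported CUDA hardware, as the asserts above note):

import torch

def grouped_mm_reference(x, w, offs):
    # x:    (total_tokens, dim)        2D activations, all experts concatenated
    # w:    (num_experts, dim, hidden) one weight matrix per expert
    # offs: (num_experts,)             cumulative token counts, int32
    outs, start = [], 0
    for expert_idx, end in enumerate(offs.tolist()):
        outs.append(x[start:end] @ w[expert_idx])
        start = end
    return torch.cat(outs, dim=0)

num_tokens_per_expert = torch.tensor([3, 1, 4])   # illustrative routing counts
offsets = torch.cumsum(num_tokens_per_expert, dim=0, dtype=torch.int32)
x = torch.randn(8, 16)                            # 3 + 1 + 4 tokens, dim=16
w = torch.randn(3, 16, 32)                        # 3 experts, hidden=32
print(grouped_mm_reference(x, w, offsets).shape)  # torch.Size([8, 32])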
