Commit a488dd4

WIP-DEBUG-PROFILE torch.compile
ghstack-source-id: 60b1db8
Pull Request resolved: #2644
1 parent 7ee3c7a commit a488dd4

6 files changed: +355 -26 lines

recipes/configs/llama4/scout_17B_16E_full.yaml

Lines changed: 15 additions & 4 deletions

@@ -18,7 +18,7 @@ output_dir: /tmp/torchtune/llama4_17Bx16E/full
 model:
   _component_: torchtune.models.llama4.llama4_scout_17b_16e
 
-tensor_parallel_dim: 2 # For multi-node training we recommend tensor_parallel_dim: 8
+tensor_parallel_dim: 1 # For multi-node training we recommend tensor_parallel_dim: 8
 tensor_parallel_plan:
   _component_: torchtune.models.llama4.decoder_only_tp_plan
 data_parallel_shard_dim: -1 # Will infer based on TP dim, effectively controls FSDP
@@ -73,10 +73,10 @@ fsdp_cpu_offload: True
 # compile False means no torch.compile
 # compile Dictionary with keys: "model", "loss", "optimizer_step"
 # enables torch.compile only for specified components.
-compile: False
+compile: True
 #   model: True
 #   loss: True
-#   optimizer_step: False
+#   optimizer_step: True
 #   scale_grads: True
 
 # Reduced precision
@@ -92,4 +92,15 @@ log_peak_memory_stats: True
 # Useful for understanding how to optimize memory and performance
 profiler:
   _component_: torchtune.training.setup_torch_profiler
-  enabled: False
+  enabled: True
+  output_dir: ${output_dir}/profiling_outputs
+  cpu: True
+  cuda: True
+  profile_memory: True
+  with_stack: True
+  record_shapes: True
+  with_flops: False
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 1
+  num_cycles: 1
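
Note: the profiler keys above mirror the options of the public torch.profiler API, with wait_steps/warmup_steps/active_steps/num_cycles forming the profiling schedule. The sketch below is a rough, hedged illustration of that mapping only; it is not torchtune's setup_torch_profiler, and train_step is a hypothetical placeholder for one training iteration.

# Illustration only (assumption: not the actual recipe wiring).
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler


def train_step() -> None:
    # stand-in for forward/backward/optimizer work
    torch.randn(1024, 1024) @ torch.randn(1024, 1024)


prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # cpu: True, cuda: True
    schedule=schedule(wait=5, warmup=3, active=1, repeat=1),    # wait/warmup/active_steps, num_cycles
    on_trace_ready=tensorboard_trace_handler("./profiling_outputs"),  # ~ ${output_dir}/profiling_outputs
    profile_memory=True,
    with_stack=True,
    record_shapes=True,
    with_flops=False,
)

with prof:
    for _ in range(12):  # enough steps to cover wait + warmup + active
        train_step()
        prof.step()      # advance the profiler schedule once per training step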

torchtune/models/llama4/_model_builders.py

Lines changed: 2 additions & 2 deletions

@@ -78,7 +78,7 @@ def llama4_scout_17b_16e(
         norm_eps=1e-5,
         num_experts=16,
         use_shared_expert=True,
-        skip_rope_interval=4,
+        skip_rope_interval=None,
         attention_chunk_size=8192,
         use_scaled_rope=True,
     )
@@ -149,7 +149,7 @@ def llama4_maverick_17b_128e(
         use_qk_norm=False,
         moe_every_n_layers=2,
         mlp_hidden_dim=16384,
-        skip_rope_interval=4,
+        skip_rope_interval=None,
         attention_chunk_size=8192,
     )
     return EarlyFusionModel(

torchtune/modules/moe/experts.py

Lines changed: 50 additions & 19 deletions

@@ -50,6 +50,7 @@ def reset_parameters(self) -> None:
     # TODO: force no inference mode as a hack to get around
     # "Cannot set version_counter for inference tensor"
     @torch.inference_mode(mode=False)
+    @torch._dynamo.disable(recursive=False)
     def forward(
         self,
         x: torch.Tensor,
@@ -64,27 +65,57 @@ def forward(
         Returns:
             torch.Tensor: tensor with shape ``(bsz * seq_len * experts_per_token, dim)``
         """
+        self.use_grouped_mm = True
+        if not self.use_grouped_mm:
+            # a tuple of tensors indexed by experts
+            # each with shape (tokens_per_expert(varying), dim)
+            x = torch.split(
+                x,
+                split_size_or_sections=num_tokens_per_expert.tolist(),
+                dim=0,
+            )
+            out_experts_splits = []
+            for expert_idx, x_expert in enumerate(x):
+                w1, w2, w3 = (
+                    self.gate_proj[expert_idx],
+                    self.down_proj[expert_idx],
+                    self.up_proj[expert_idx],
+                )
+                h = self.act_fn(torch.matmul(x_expert, w1))
+                h = h * torch.matmul(x_expert, w3)
+                h = torch.matmul(h, w2)
+                # h shape (tokens_per_expert(varying), dim)
+                out_experts_splits.append(h)
+            out = torch.cat(out_experts_splits, dim=0)
 
-        # a tuple of tensors indexed by experts
-        # each with shape (tokens_per_expert(varying), dim)
-        x = torch.split(
-            x,
-            split_size_or_sections=num_tokens_per_expert.tolist(),
-            dim=0,
-        )
-        out_experts_splits = []
-        for expert_idx, x_expert in enumerate(x):
-            w1, w2, w3 = (
-                self.gate_proj[expert_idx],
-                self.down_proj[expert_idx],
-                self.up_proj[expert_idx],
+            return out
+
+        # grouped mm implementation
+        if num_tokens_per_expert is not None:
+            # https://github.com/pytorch/pytorch/pull/150374
+            # NOTE: torch._grouped_mm requires bf16 dtypes
+            # and shapes to be multiple of 8
+            offsets = torch.cumsum(
+                num_tokens_per_expert, dim=0, dtype=torch.int32
             )
-            h = self.act_fn(torch.matmul(x_expert, w1))
-            h = h * torch.matmul(x_expert, w3)
-            h = torch.matmul(h, w2)
-            # h shape (tokens_per_expert(varying), dim)
-            out_experts_splits.append(h)
-        out = torch.cat(out_experts_splits, dim=0)
+            # grouped mm between a 2D tensor and a 3D tensor
+            assert x.dim() == 2
+        else:
+            offsets = None
+            # fall back to regular bmm between 3D tensors
+            assert x.dim() == 3
+
+        w1, w2, w3 = (
+            self.gate_proj,
+            self.down_proj,
+            self.up_proj,
+        )
+        assert (
+            x.dtype == w1.dtype == w2.dtype == w3.dtype == torch.bfloat16
+        ), "torch._grouped_mm only supports bf16 dtypes"
+        h = F.silu(torch._grouped_mm(x, w1, offs=offsets))
+        h = h * torch._grouped_mm(x, w3, offs=offsets)
+        out = torch._grouped_mm(h, w2, offs=offsets)
 
         return out
 
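
Note: torch._grouped_mm is a private PyTorch op (see the linked PR), and the @torch._dynamo.disable(recursive=False) decorator keeps torch.compile from tracing this forward frame while still allowing compilation of the functions it calls. The sketch below is a plain-PyTorch reference (an illustration, not torchtune code and not the fused kernel) for what the 2D grouped path above computes: each contiguous row segment of x, delimited by the cumulative offsets, is multiplied by its own expert's weight matrix.

# Reference semantics sketch; runnable on any device/dtype.
import torch


def grouped_mm_reference(
    x: torch.Tensor,        # (total_tokens, dim), rows already sorted by expert
    w: torch.Tensor,        # (num_experts, dim, hidden_dim)
    offsets: torch.Tensor,  # (num_experts,) cumulative row counts per expert
) -> torch.Tensor:
    out = x.new_empty(x.shape[0], w.shape[-1])
    start = 0
    for expert_idx in range(w.shape[0]):
        end = int(offsets[expert_idx])
        # rows [start, end) belong to this expert
        out[start:end] = x[start:end] @ w[expert_idx]
        start = end
    return out


num_tokens_per_expert = torch.tensor([3, 0, 5, 2])
offsets = torch.cumsum(num_tokens_per_expert, dim=0, dtype=torch.int32)
x = torch.randn(int(num_tokens_per_expert.sum()), 16)
w = torch.randn(4, 16, 32)
print(grouped_mm_reference(x, w, offsets).shape)  # torch.Size([10, 32])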

torchtune/modules/moe/indices.py

Lines changed: 254 additions & 0 deletions (new file)

@@ -0,0 +1,254 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
import triton
import triton.language as tl


__all__ = ["generate_permute_indices"]


# parallelized kernel
@triton.jit
def _fill_indices_kernel(
    tokens_per_expert_group_ptr,
    start_index_values_ptr,
    write_offsets_ptr,
    output_ptr,
    experts_per_rank: tl.constexpr,
    num_ranks: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,  # Number of threads per block
):
    pid = tl.program_id(axis=0)
    num_programs = tl.num_programs(axis=0)

    # map programs (blocks) to the experts and loop (grid stride) if needed
    for expert_id in range(pid, experts_per_rank, num_programs):
        # read this expert's write offset
        write_offset = tl.load(write_offsets_ptr + expert_id)

        # loop over all ranks
        for r in range(num_ranks):
            # index into tokens_per_expert_group array
            i = r * experts_per_rank + expert_id

            # load start index and number of tokens for this expert-rank pair
            start_index = tl.load(start_index_values_ptr + i)
            length = tl.load(tokens_per_expert_group_ptr + i)

            # each thread in block processes tokens in parallel
            offsets = tl.arange(0, BLOCK_SIZE)

            # tokens are processed in chunks of BLOCK_SIZE
            for chunk_start in range(0, length, BLOCK_SIZE):
                chunk_offsets = chunk_start + offsets

                # mask valid indices
                mask = chunk_offsets < length

                values = start_index + chunk_offsets

                # destination
                dest_indices = write_offset + chunk_offsets

                # store
                tl.store(output_ptr + dest_indices, values, mask=mask)

            # update write offset for next rank
            write_offset += length


# ==============
# wrapper
# ==============


def fill_indices_wrapper(
    tokens_per_expert_group: torch.Tensor,
    start_index_values: torch.Tensor,
    write_offsets: torch.Tensor,
    experts_per_rank: int,
    num_ranks: int,
    max_len: int,
    block_size: int = 128,
    max_blocks: int = 1024,  # cap on total number of blocks to launch
):
    # preallocate output
    permuted_indices = torch.full(
        (max_len,), -1, dtype=torch.int32, device=tokens_per_expert_group.device
    )

    # write offsets is per local expert...
    num_blocks = min(experts_per_rank, max_blocks)
    # grid = one block per expert unless capped and then we loop...
    grid = (num_blocks,)

    # launch kernel
    _fill_indices_kernel[grid](
        tokens_per_expert_group,
        start_index_values,
        write_offsets,
        permuted_indices,
        experts_per_rank,
        num_ranks,
        BLOCK_SIZE=block_size,
    )
    return permuted_indices


# reference
def fill_indices_cpu(
    tokens_per_expert_group: torch.Tensor,
    start_index_values: torch.Tensor,
    write_offsets: torch.Tensor,
    experts_per_rank: int,
    num_ranks: int,
    max_len: int,
):
    # We need to preallocate the output - we ignore device and force it on cpu
    # device = tokens_per_expert_group.device
    permuted_indices = torch.full(
        (max_len,),
        -1,
        dtype=torch.int32,
    )  # device=device)
    # Fill the permuted indices
    # For each local expert
    for e in range(experts_per_rank):
        write_start = write_offsets[e].item()
        # For each remote rank
        for r in range(num_ranks):
            i = r * experts_per_rank + e
            start_index = start_index_values[i].item()
            length = tokens_per_expert_group[i].item()
            # Fill in the indices
            if length > 0:
                end_idx = min(write_start + length, max_len)
                permuted_indices[write_start:end_idx] = torch.arange(
                    start_index,
                    start_index + (end_idx - write_start),
                    dtype=torch.int32,
                    # device=device,
                )
            write_start += length
    return permuted_indices


def generate_permute_indices(
    tokens_per_expert_group: torch.Tensor,
    experts_per_rank: int,
    num_ranks: int,
    max_len: int,
    alignment: int,
    use_cpu: bool = False,
):
    """
    Prepare permutation indices and the number of tokens for each expert.

    Args:
        tokens_per_expert_group: number of tokens for each expert from all ranks.
        experts_per_rank: number of experts per rank.
        num_ranks: number of ranks.
        max_len: maximum length of the output index vector.
        alignment: alignment for each returned element in `m_sizes`.
        use_cpu: whether to use the CPU implementation.

    Returns:
        permuted_indices: tensor of indices that map original token order to the expert-grouped order.
        m_sizes: aligned number of tokens for each expert (padded to the alignment boundary).
        m_offsets: cumulative sum of m_sizes; the exclusive ending position for each expert's tokens.

    Explanatory details:
        `tokens_per_expert_group` is of shape (num_ranks * experts_per_rank,), for example:
        From: | rank 0            | rank 1            |
        To:   | E0 | E1 | E2 | E3 | E0 | E1 | E2 | E3 |
              |  4 |  2 |  1 |  3 |  1 |  2 |  3 |  4 |
    """
    # prefix sum to get start index of each expert (parallel scan kernel in future?)
    start_index_values = (
        torch.cumsum(tokens_per_expert_group, 0) - tokens_per_expert_group
    )

    # chunk sizes for each expert
    chunk_size_per_expert = tokens_per_expert_group.view(num_ranks, -1).sum(0)

    # align the chunk sizes (cdiv)
    m_sizes = ((chunk_size_per_expert + alignment - 1) // alignment * alignment).to(
        torch.int32
    )

    # additional prefix sum to get write offset of each expert in permuted_indices
    # write offsets is per local expert, not global
    m_offsets = torch.cumsum(m_sizes, 0)
    write_offsets = m_offsets - m_sizes

    # Select the implementation to use
    if use_cpu:
        permuted_indices = fill_indices_cpu(
            tokens_per_expert_group,
            start_index_values,
            write_offsets,
            experts_per_rank,
            num_ranks,
            max_len,
        )
    else:
        permuted_indices = fill_indices_wrapper(
            tokens_per_expert_group,
            start_index_values,
            write_offsets,
            experts_per_rank,
            num_ranks,
            max_len,
        )

    return permuted_indices, m_sizes, m_offsets.to(torch.int32)


# Below is for testing only


def simple_test():
    device = torch.device("cuda", 0)
    experts_per_rank = 4
    num_ranks = 4
    tokens_per_expert_group = torch.full(
        (num_ranks * experts_per_rank,), 4, dtype=torch.int32, device=device
    )
    max_len = 128
    alignment = 32
    # Use the GPU kernel
    permuted_indices_gpu, m_sizes, _ = generate_permute_indices(
        tokens_per_expert_group, experts_per_rank, num_ranks, max_len, alignment
    )
    # Use the CPU method
    permuted_indices_cpu, m_sizes, _ = generate_permute_indices(
        tokens_per_expert_group,
        experts_per_rank,
        num_ranks,
        max_len,
        alignment,
        use_cpu=True,
    )
    # Check that the results are the same
    assert torch.equal(permuted_indices_gpu.cpu(), permuted_indices_cpu)
    assert torch.equal(
        torch.remainder(m_sizes, alignment),
        torch.zeros(experts_per_rank, device=device),
    )
    # Print the results
    print(f"{permuted_indices_gpu=}, \n{permuted_indices_cpu=}")
    print(f"{m_sizes=}")
    print("Success")
    return True  # assert would have failed, meaning getting here is success


if __name__ == "__main__":
    simple_test()
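
Note: simple_test() above exercises the Triton kernel against the CPU reference. As a CPU-only usage sketch that follows the docstring's example layout (2 ranks, 4 local experts), and assuming the module path added in this commit, generate_permute_indices could be called roughly like this; the printed values in the comments follow from the alignment math in the function.

# Usage sketch (assumption: illustrative values, not from the commit's tests).
import torch

from torchtune.modules.moe.indices import generate_permute_indices

# rank 0 sends [4, 2, 1, 3] tokens to experts E0..E3, rank 1 sends [1, 2, 3, 4]
tokens_per_expert_group = torch.tensor([4, 2, 1, 3, 1, 2, 3, 4], dtype=torch.int32)
permuted_indices, m_sizes, m_offsets = generate_permute_indices(
    tokens_per_expert_group,
    experts_per_rank=4,
    num_ranks=2,
    max_len=64,   # upper bound on total (padded) token count
    alignment=8,  # pad each expert's row count to a multiple of 8
    use_cpu=True, # keeps the example runnable without a GPU kernel launch
)
# Expert 0 receives 4 tokens from rank 0 and 1 from rank 1 -> 5 tokens, padded to 8.
print(m_sizes)               # -> [8, 8, 8, 8]
print(m_offsets)             # -> [8, 16, 24, 32]
print(permuted_indices[:8])  # indices of expert 0's tokens, -1 where padded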
