bugfix: adding lse output to blackwell fmha kernels (#1071)

yzh119 · web-flow · commit 7bdadce0a44c · 2025-05-19T23:12:09.000-07:00
diff --git a/include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp b/include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp
@@ -49,7 +49,7 @@ struct Sm100FmhaFwdEpilogueTmaWarpspecialized {
   using LayoutO = cute::Layout<ShapeT, StrideO>;
 
   using ShapeLSE = cute::Shape<int32_t, cute::Shape<int32_t, int32_t>>;
-  using StrideLSE = cute::Shape<_1, cute::Shape<int32_t, int32_t>>;
+  using StrideLSE = cute::Shape<int32_t, cute::Shape<_1, int32_t>>;
   using LayoutLSE = cute::Layout<ShapeLSE, StrideLSE>;
 
   //  using SmemLayoutO = decltypa(make_layout(append<3>(select<0,1>(TileShape_WG{}), _2{})));
@@ -103,6 +103,10 @@ struct Sm100FmhaFwdEpilogueTmaWarpspecialized {
     cute::prefetch_tma_descriptor(params.tma_store_o.get_tma_descriptor());
   }
 
+  const Params& params;  // Non-owning reference: the bound Params must outlive this object.
+
+  CUTLASS_DEVICE Sm100FmhaFwdEpilogueTmaWarpspecialized(const Params& params) : params(params) {}
+
   template <class BlkCoord, class ProblemShape, class ParamsProblemShape>
   CUTLASS_DEVICE auto store(BlkCoord const& blk_coord, ProblemShape const& problem_shape,
                             Params const& params, ParamsProblemShape const& params_problem_shape,
@@ -120,10 +124,6 @@ struct Sm100FmhaFwdEpilogueTmaWarpspecialized {
     int o0_index = 2 * get<0>(blk_coord);
     int o1_index = 2 * get<0>(blk_coord) + 1;
 
-    // Tensor mLSE = make_tensor(make_gmem_ptr(params.ptr_LSE), params.layout_LSE);
-    // Tensor gLSE = get_lse_local_tile_tensor(mLSE, Shape<Int<CTA_Q>>{}, qo_head_idx, qo_indptr,
-    //                                         qo_len)(_, qo_tile_idx);
-
     int max_length_q = get<0>(params_problem_shape).max_length;
     int offs_0 = max_length_q - qo_len;
     int offs_2_1 = qo_segment_offset + qo_len;
diff --git a/include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp b/include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp
@@ -906,14 +906,17 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
     }
   }
 
-  template <class BlkCoord, class ProblemShape, class TensorStorageEpi>
+  template <class BlkCoord, class ParamsProblemShape, class ProblemShape, class TensorStorageEpi,
+            class CollectiveEpilogue>
   CUTLASS_DEVICE auto correction(
-      BlkCoord const& blk_coord, Params const& params, ProblemShape const& problem_shape,
+      BlkCoord const& blk_coord, Params const& params,
+      ParamsProblemShape const& params_problem_shape, ProblemShape const& problem_shape,
       TensorStorageEpi& shared_storage_epi, PipelineC& pipeline_s0_c,
       typename PipelineC::PipelineState& pipeline_s0_c_consumer_state, PipelineC& pipeline_s1_c,
       typename PipelineC::PipelineState& pipeline_s1_c_consumer_state, PipelineO& pipeline_o,
       typename PipelineO::PipelineState& pipeline_o_consumer_state, PipelineE& pipeline_epi,
-      typename PipelineE::PipelineState& pipeline_epi_producer_state) {
+      typename PipelineE::PipelineState& pipeline_epi_producer_state,
+      CollectiveEpilogue& epilogue) {
     int mask_tile_count = Mask{}.get_trip_count(blk_coord, TileShape{}, problem_shape);
 
     int thread_idx = threadIdx.x % (4 * cutlass::NumThreadsPerWarp);
@@ -1024,7 +1027,23 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
     //    store to smem
     Tensor sO = make_tensor(make_smem_ptr(shared_storage_epi.smem_o.data()),
                             typename TensorStorageEpi::SmemLayoutO{});
+    Tensor gLSE = make_tensor(make_gmem_ptr(epilogue.params.ptr_LSE), epilogue.params.layout_LSE);
     correction_epilogue(params.scale_output / tTMEM_LOADVrS(kIdxFinalRowSum), _0{}, sO);
+    if (epilogue.params.ptr_LSE != nullptr) {
+      int qo_tile_idx = get<0>(blk_coord);
+      int qo_head_idx = get<2, 0>(blk_coord);
+      int batch_idx = get<2, 1>(blk_coord);
+      int qo_len = get<0>(problem_shape);
+      int segment_offset = get<0>(params_problem_shape).segment_offsets[batch_idx];
+      int row_idx = get<0>(tTMEM_LOADVcS(_0{})) + get<0>(TileShape{}) * qo_tile_idx;
+
+      ElementPV lse = __log2f(tTMEM_LOADVrS(kIdxFinalRowSum)) +
+                      params.scale_softmax_log2 * tTMEM_LOADVrS(kIdxFinalRowMax);
+
+      if (row_idx < qo_len) {
+        gLSE(segment_offset + row_idx, qo_head_idx) = lse;
+      }
+    }
     // correction_epilogue(params.scale_output, _0{}, sO);
 
     cutlass::arch::fence_view_async_tmem_load();
@@ -1047,6 +1066,23 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
     pipeline_epi.producer_acquire(pipeline_epi_producer_state);
 
     correction_epilogue(params.scale_output / tTMEM_LOADVrS(kIdxFinalRowSum), _1{}, sO);
+
+    if (epilogue.params.ptr_LSE != nullptr) {
+      int qo_tile_idx = get<0>(blk_coord);
+      int qo_head_idx = get<2, 0>(blk_coord);
+      int batch_idx = get<2, 1>(blk_coord);
+      int qo_len = get<0>(problem_shape);
+      int segment_offset = get<0>(params_problem_shape).segment_offsets[batch_idx];
+      int row_idx =
+          get<0>(tTMEM_LOADVcS(_0{})) + get<0>(TileShape{}) * qo_tile_idx + get<0>(TileShapeQK{});
+
+      ElementPV lse = __log2f(tTMEM_LOADVrS(kIdxFinalRowSum)) +
+                      params.scale_softmax_log2 * tTMEM_LOADVrS(kIdxFinalRowMax);
+
+      if (row_idx < qo_len) {
+        gLSE(segment_offset + row_idx, qo_head_idx) = lse;
+      }
+    }
     // correction_epilogue(params.scale_output, _1{}, sO);
     cutlass::arch::fence_view_async_tmem_load();
 
diff --git a/include/flashinfer/attention/blackwell/fmha_cutlass_sm100.cuh b/include/flashinfer/attention/blackwell/fmha_cutlass_sm100.cuh
@@ -56,7 +56,7 @@ struct FwdRunner {
   // NOTE(Zihao): use markus's trick for tma store
   using StrideO =
       cute::tuple<int, _1, cute::tuple<cute::tuple<int, int>, int>>;  // Q D (H_G H_R) CUMULATIVE_Q
-  using StrideLSE = cute::tuple<_1, cute::tuple<int, int>>;           // Q (H_G H_R)
+  using StrideLSE = cute::tuple<int, cute::tuple<_1, int>>;           // Q (H_G H_R)
 
   using Mainloop = cutlass::fmha::collective::Sm100FmhaFwdMainloopTmaWarpspecialized<
       Element, ElementAccumulatorQK, ElementAccumulatorPV, TileShapeQK, TileShapePV, StrideQ,
@@ -108,7 +108,7 @@ struct FwdRunner {
         make_stride(make_stride(head_dim_vo, h_r * head_dim_vo), num_qo_heads * head_dim_vo));
     stride_K = make_stride(num_kv_heads * head_dim_qk, _1{}, make_stride(_0{}, head_dim_qk));
     stride_V = make_stride(_1{}, num_kv_heads * head_dim_vo, make_stride(_0{}, head_dim_vo));
-    stride_LSE = make_stride(_1{}, make_stride(total_qo_len, total_qo_len * h_r));
+    stride_LSE = make_stride(num_qo_heads, make_stride(_1{}, h_r));
 
     auto shape_Q = make_shape(total_qo_len, head_dim_qk, make_shape(h_r, num_kv_heads));
     auto shape_O = make_shape(max_qo_len, head_dim_vo,
diff --git a/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp b/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp
@@ -360,7 +360,7 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
         cutlass::make_producer_start_state<typename CollectiveMainloop::PipelineO>();
 
     CollectiveMainloop mainloop;
-    CollectiveEpilogue epilogue;
+    CollectiveEpilogue epilogue{params.epilogue};
 
     if (role == WarpRole::Softmax0 || role == WarpRole::Softmax1) {
       warpgroup_reg_set<NumRegsSoftmax>();
@@ -400,11 +400,12 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
           continue;
         }
 
-        mainloop.correction(
-            blk_coord, params.mainloop, logical_problem_shape, shared_storage.epilogue,
-            pipeline_s0_corr, pipeline_s0_corr_consumer_state, pipeline_s1_corr,
-            pipeline_s1_corr_consumer_state, pipeline_mma_corr, pipeline_mma_corr_consumer_state,
-            pipeline_corr_epi, pipeline_corr_epi_producer_state);
+        mainloop.correction(blk_coord, params.mainloop, params.problem_shape, logical_problem_shape,
+                            shared_storage.epilogue, pipeline_s0_corr,
+                            pipeline_s0_corr_consumer_state, pipeline_s1_corr,
+                            pipeline_s1_corr_consumer_state, pipeline_mma_corr,
+                            pipeline_mma_corr_consumer_state, pipeline_corr_epi,
+                            pipeline_corr_epi_producer_state, epilogue);
       }
 
       if constexpr (NumWarpsEpilogue == 0) {
diff --git a/tests/test_blackwell_fmha.py b/tests/test_blackwell_fmha.py
@@ -62,7 +62,7 @@ def attention_ref(
 @pytest.mark.parametrize("head_dim_qk", [192, 128])
 @pytest.mark.parametrize("head_dim_vo", [128])
 @pytest.mark.parametrize("causal", [False, True])
-@pytest.mark.parametrize("dtype", [torch.half])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
 def test_blackwell_cutlass_fmha(
     batch_size,
     qo_len,
@@ -117,41 +117,32 @@ def test_blackwell_cutlass_fmha(
     )
     o, lse = wrapper.run(q, k, v, return_lse=True)
 
-    # gqa_group_ratio = num_qo_heads // num_kv_heads
-    # k_repeated = torch.repeat_interleave(k, gqa_group_ratio, dim=1)
-    # v_repeated = torch.repeat_interleave(v, gqa_group_ratio, dim=1)
-    # o_ref, lse_ref = attention_ref(
-    #     batch_size, q, k_repeated, v_repeated, causal, sm_scale
-    # )
-
-    # lse_ref = lse_ref.flatten(0, 1)
-    # if dtype == torch.half:
-    #     torch.testing.assert_close(o, o_ref, rtol=1e-3, atol=1e-3)
-    # else:
-    #     torch.testing.assert_close(o, o_ref, rtol=1e-2, atol=1e-2)
+    gqa_group_ratio = num_qo_heads // num_kv_heads
+    k_repeated = torch.repeat_interleave(k, gqa_group_ratio, dim=1)
+    v_repeated = torch.repeat_interleave(v, gqa_group_ratio, dim=1)
+    o_ref, lse_ref = attention_ref(
+        batch_size, q, k_repeated, v_repeated, causal, sm_scale
+    )
 
-    # torch.testing.assert_close(lse, lse_ref, rtol=1e-3, atol=1e-3)
+    lse_ref = lse_ref.flatten(0, 1)
+    if dtype == torch.half:
+        torch.testing.assert_close(o, o_ref, rtol=1e-3, atol=1e-3)
+    else:
+        torch.testing.assert_close(o, o_ref, rtol=1e-2, atol=1e-2)
 
-    # test with pre-allocated output
-    # o_buffer = torch.empty_like(o)
-    # lse_buffer = torch.empty_like(lse)
-    # flashinfer.prefill.fmha(
-    #     q, k, v, qo_lens, kv_lens, out=o_buffer, lse=lse_buffer, causal=causal
-    # )
-    # torch.testing.assert_close(o, o_buffer, rtol=1e-3, atol=1e-3)
-    # torch.testing.assert_close(lse, lse_buffer, rtol=1e-3, atol=1e-3)
+    torch.testing.assert_close(lse, lse_ref, rtol=1e-3, atol=1e-3)
 
 
 if __name__ == "__main__":
     test_blackwell_cutlass_fmha(
         1,
-        1,
-        1,
         32,
+        32,
+        4,
         4,
         192,
         128,
-        False,
+        True,
         torch.bfloat16,
         # 3,
         # 999,