Commit fc62e3f

bugfix: follow user-specified sm_scale for blackwell cutlass fmha (#1072)
## 📌 Description

Use the user-specified `sm_scale` instead of a hardcoded value for the Blackwell CUTLASS FMHA kernel. cc @nandor

---

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
1 parent 7bdadce commit fc62e3f
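In practice, the fix means `fmha_varlen` only computes the default `1.0 / math.sqrt(head_dim_qk)` when the caller supplies no scale. A minimal call sketch, assuming the parameter names visible in the diff below (`qo_segment_offsets`, `kv_segment_offsets`, `causal`, `sm_scale`) are keyword-accessible and that the tensor layouts follow the `(nnz, num_heads, head_dim)` convention from `flashinfer/prefill.py`; the full signature may differ:

```python
import torch
from flashinfer.prefill import fmha_varlen  # function and module visible in the diff

num_qo_heads, num_kv_heads, head_dim = 32, 4, 128
nnz = 1024  # total tokens across all packed sequences

# Requires a Blackwell-class GPU for the cutlass backend.
q = torch.randn(nnz, num_qo_heads, head_dim, dtype=torch.bfloat16, device="cuda")
k = torch.randn(nnz, num_kv_heads, head_dim, dtype=torch.bfloat16, device="cuda")
v = torch.randn(nnz, num_kv_heads, head_dim, dtype=torch.bfloat16, device="cuda")
# Two packed sequences of 512 tokens each.
offsets = torch.tensor([0, 512, 1024], dtype=torch.int32, device="cuda")

# The user-specified scale is now honored; before this fix the kernel
# always used 1/sqrt(head_dim_qk) regardless of this argument.
out = fmha_varlen(q, k, v, offsets, offsets, causal=True, sm_scale=1.0)

# Passing sm_scale=None keeps the previous default behavior.
out_default = fmha_varlen(q, k, v, offsets, offsets, causal=True, sm_scale=None)
```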

4 files changed: +7 −8 lines
flashinfer/prefill.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -2638,7 +2638,8 @@ def fmha_varlen(
     nnz_kv, num_kv_heads, head_dim_vo = v.shape
 
     mask_mode_code = 1 if causal else 0
-    sm_scale = 1.0 / math.sqrt(head_dim_qk)
+    if sm_scale is None:
+        sm_scale = 1.0 / math.sqrt(head_dim_qk)
 
     qo_lens = qo_segment_offsets[1:] - qo_segment_offsets[:-1]
     kv_lens = kv_segment_offsets[1:] - kv_segment_offsets[:-1]
```
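The `None` sentinel resolves the default at the Python layer, which also frees `0.0` to be a legal explicit scale; the C++ side previously overloaded `0.0f` to mean "use the default" (see the mainloop diff below). A standalone sketch of the resolved behavior, using a hypothetical helper name:

```python
import math

def resolve_sm_scale(sm_scale, head_dim_qk):
    # None means "no user preference": fall back to the conventional
    # 1/sqrt(d_k) attention scaling. Any explicit float, including 0.0,
    # is preserved as-is.
    return 1.0 / math.sqrt(head_dim_qk) if sm_scale is None else sm_scale

assert resolve_sm_scale(None, 128) == 1.0 / math.sqrt(128)
assert resolve_sm_scale(1.0, 128) == 1.0
assert resolve_sm_scale(0.0, 128) == 0.0  # no longer swallowed by a sentinel
```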

include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp

Lines changed: 1 addition & 5 deletions
```diff
@@ -171,8 +171,7 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
   struct Arguments {
     typename Load::Arguments load;
 
-    // if zero, defaults to 1/sqrt(D)
-    float scale_softmax = 0.0f;
+    float scale_softmax;
 
     // scaling factors to dequantize QKV
     float scale_q = 1.0f;
@@ -201,9 +200,6 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
   static Params to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args,
                                         void* workspace) {
     float scale_softmax = args.scale_softmax;
-    if (scale_softmax == 0.0f) {
-      scale_softmax = 1.0f / (float)std::sqrt(get<2>(problem_shape));
-    }
     float log2_e = static_cast<float>(std::log2(std::exp(1.0)));
 
     return Params{Load::to_underlying_arguments(problem_shape, args.load, workspace),
```

include/flashinfer/attention/blackwell/fmha_cutlass_sm100.cuh

Lines changed: 1 addition & 1 deletion
```diff
@@ -126,7 +126,7 @@ struct FwdRunner {
     typename Operation::Arguments arguments{
         problem_shape,
         {static_cast<Element*>(q.data_ptr()), layout_Q, static_cast<Element*>(k.data_ptr()),
-         layout_K, static_cast<Element*>(v.data_ptr()), layout_V},
+         layout_K, static_cast<Element*>(v.data_ptr()), layout_V, float(sm_scale)},
         {static_cast<ElementOut*>(o.data_ptr()) - max_qo_len * get<0>(stride_O), layout_O,
          static_cast<ElementAccumulatorPV*>(maybe_lse.value().data_ptr()), layout_LSE},
         hw_info};
```
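With the sentinel gone, the runner forwards `float(sm_scale)` unconditionally, and `scale_softmax` is simply the multiplier applied to the Q·Kᵀ logits before the softmax (the `log2_e` factor in the mainloop serves an exp2-based softmax and does not change the math). A plain PyTorch sketch of the computation, assuming a single dense sequence with matching query and key head counts:

```python
import torch

def scaled_attention(q, k, v, scale_softmax, causal=False):
    # q, k: [seq_len, num_heads, head_dim_qk]; v: [seq_len, num_heads, head_dim_vo]
    logits = torch.einsum("qhd,khd->hqk", q.float(), k.float()) * scale_softmax
    if causal:
        sq, sk = logits.shape[-2:]
        future = torch.ones(sq, sk, dtype=torch.bool, device=logits.device).triu(1)
        logits = logits.masked_fill(future, float("-inf"))
    return torch.einsum("hqk,khd->qhd", logits.softmax(dim=-1), v.float())
```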

tests/test_blackwell_fmha.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -61,6 +61,7 @@ def attention_ref(
 @pytest.mark.parametrize("num_kv_heads", [4, 32])
 @pytest.mark.parametrize("head_dim_qk", [192, 128])
 @pytest.mark.parametrize("head_dim_vo", [128])
+@pytest.mark.parametrize("sm_scale", [1.0, 1.0 / math.sqrt(192), 1.0 / math.sqrt(128)])
 @pytest.mark.parametrize("causal", [False, True])
 @pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
 def test_blackwell_cutlass_fmha(
@@ -71,6 +72,7 @@ def test_blackwell_cutlass_fmha(
     num_kv_heads,
     head_dim_qk,
     head_dim_vo,
+    sm_scale,
     causal,
     dtype,
 ):
@@ -102,7 +104,6 @@ def test_blackwell_cutlass_fmha(
         kv_layout="NHD",
         backend="cutlass",
     )
-    sm_scale = 1.0 / (head_dim_qk**0.5)
     wrapper.plan(
         qo_indptr,
         kv_indptr,
@@ -142,6 +143,7 @@ def test_blackwell_cutlass_fmha(
         4,
         192,
         128,
+        1,
         True,
         torch.bfloat16,
         # 3,
```
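Of the new parametrized scales, `sm_scale=1.0` is the discriminating case: under the old hardcoded path a caller-supplied 1.0 was silently replaced by 1/√d, so the kernel output would diverge from a correctly scaled reference, while the 1/√192 and 1/√128 cases happen to coincide with the default. A self-contained numeric illustration of why the outputs differ (plain PyTorch, not the test's actual plumbing):

```python
import math
import torch

torch.manual_seed(0)
d = 128
q, k, v = torch.randn(4, d), torch.randn(6, d), torch.randn(6, d)

def attn(sm_scale):
    return torch.softmax(q @ k.T * sm_scale, dim=-1) @ v

# A unit scale sharpens the softmax relative to 1/sqrt(128), so the outputs
# differ measurably — exactly the discrepancy the hardcoded scale used to hide.
assert not torch.allclose(attn(1.0), attn(1.0 / math.sqrt(d)), atol=1e-3)
```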
