import triton_kernels_benchmark as benchmark_suit


+def gen_args(BATCH, N_CTX, Q_HEAD_NUM, KV_HEAD_NUM, HEAD_DIM, dtype, device):
+
+    b_seq_len_prefix = torch.randint(1, N_CTX // 2, (BATCH, ), dtype=torch.int32, device=device)
+    b_seq_len_extend = torch.randint(1, N_CTX // 2, (BATCH, ), dtype=torch.int32, device=device)
+    b_seq_len = b_seq_len_prefix + b_seq_len_extend
+    max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
+
+    b_req_idx = torch.arange(BATCH, dtype=torch.int32, device=device)
+    b_start_loc = torch.zeros((BATCH, ), dtype=torch.int32, device=device)
+    b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+    b_start_loc_extend = torch.zeros((BATCH, ), dtype=torch.int32, device=device)
+    b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+
+    kv_indptr = torch.zeros((BATCH + 1, ), dtype=torch.int32, device=device)
+    kv_indptr[1:BATCH + 1] = torch.cumsum(b_seq_len_prefix[:BATCH], dim=0)
+    kv_indices = torch.zeros((b_seq_len_prefix.sum().item(), ), dtype=torch.int32, device=device)
+
+    for i in range(BATCH):
+        kv_indices[kv_indptr[i]:kv_indptr[i + 1]] = torch.arange(b_start_loc[i], b_start_loc[i] + b_seq_len_prefix[i])
+
+    total_token_num = torch.sum(b_seq_len).item()
+    extend_token_num = torch.sum(b_seq_len_extend).item()
+    k_buffer = torch.empty((total_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype,
+                           device=device).normal_(mean=0.1, std=0.2)
+    v_buffer = torch.empty((total_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype,
+                           device=device).normal_(mean=0.1, std=0.2)
+
+    k_extend = torch.empty((extend_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
+    v_extend = torch.empty((extend_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
+    q_extend = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
+    for i in range(BATCH):
+        extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+        extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+        extend_start = b_start_loc_extend[i]
+        extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
+        k_extend[extend_start:extend_end] = k_buffer[extend_start_in_buffer:extend_end_in_buffer]
+        v_extend[extend_start:extend_end] = v_buffer[extend_start_in_buffer:extend_end_in_buffer]
+        q_extend[extend_start:extend_end] = torch.empty((b_seq_len_extend[i], Q_HEAD_NUM, HEAD_DIM), dtype=dtype,
+                                                        device=device).normal_(mean=0.1, std=0.2)
+
+    o_extend = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
+    o_redundant = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
+
+    b_seq_len_extend = b_seq_len - b_seq_len_prefix
+    max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+    qo_indptr = torch.zeros((BATCH + 1, ), dtype=torch.int32, device=device)
+    qo_indptr[1:BATCH + 1] = torch.cumsum(b_seq_len_extend[:BATCH], dim=0)
+
+    params = []
+    params.append((q_extend, k_extend, v_extend, o_extend, o_redundant))
+    params.append((k_buffer, v_buffer))
+    params.append((qo_indptr, kv_indptr, kv_indices, max_len_extend))
+    params.append((b_req_idx, b_start_loc, b_seq_len, b_seq_len_prefix, max_len_in_batch))
+    return params
+
+
# pylint: disable=unused-argument
@benchmark_suit.perf_report(
    benchmark_suit.Benchmark(
        # argument names to use as an x-axis for the plot
-        x_names=['BATCH', 'SEQ_LENS', 'Q_HEAD_NUM', 'KV_HEAD_NUM', 'HEAD_DIM', 'MODE', 'VALIDATE'],
+        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'MODE', 'VALIDATE'],
        x_vals=[  #
            [bs, [1024, 128, 512], 32, 8, 128, 'fwd', True] for bs in [1, 16, 32, 64, 128]
        ] + [  #
...
        # name for the plot. Used also as a file name for saving the plot.
        args={},
    ))
-def benchmark(BATCH, SEQ_LENS, Q_HEAD_NUM, KV_HEAD_NUM, HEAD_DIM, MODE, VALIDATE, provider):
+def benchmark(B, SEQ_LENS, H_Q, H_KV, D, MODE, VALIDATE, provider):
    torch.manual_seed(0)
+
    dtype = torch.bfloat16
    N_CTX = sum(SEQ_LENS)

-    b_seq_len_prefix = torch.randint(1, N_CTX // 2, (BATCH, ), dtype=torch.int32, device='xpu')
-    b_seq_len_extend = torch.randint(1, N_CTX // 2, (BATCH, ), dtype=torch.int32, device='xpu')
-    b_seq_len = b_seq_len_prefix + b_seq_len_extend
-    max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
-
-    b_req_idx = torch.arange(BATCH, dtype=torch.int32, device='xpu')
-    b_start_loc = torch.zeros((BATCH, ), dtype=torch.int32, device='xpu')
-    b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
-    b_start_loc_extend = torch.zeros((BATCH, ), dtype=torch.int32, device='xpu')
-    b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
-
-    kv_indptr = torch.zeros((BATCH + 1, ), dtype=torch.int32, device='xpu')
-    kv_indptr[1:BATCH + 1] = torch.cumsum(b_seq_len_prefix[:BATCH], dim=0)
-    kv_indices = torch.zeros((b_seq_len_prefix.sum().item(), ), dtype=torch.int32, device='xpu')
-
-    for i in range(BATCH):
-        kv_indices[kv_indptr[i]:kv_indptr[i + 1]] = torch.arange(b_start_loc[i], b_start_loc[i] + b_seq_len_prefix[i])
-
-    total_token_num = torch.sum(b_seq_len).item()
-    extend_token_num = torch.sum(b_seq_len_extend).item()
-    k_buffer = torch.empty((total_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype,
-                           device='xpu').normal_(mean=0.1, std=0.2)
-    v_buffer = torch.empty((total_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype,
-                           device='xpu').normal_(mean=0.1, std=0.2)
-
-    k_extend = torch.empty((extend_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype, device='xpu')
-    v_extend = torch.empty((extend_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype, device='xpu')
-    q_extend = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device='xpu')
-    for i in range(BATCH):
-        extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
-        extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
-        extend_start = b_start_loc_extend[i]
-        extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
-        k_extend[extend_start:extend_end] = k_buffer[extend_start_in_buffer:extend_end_in_buffer]
-        v_extend[extend_start:extend_end] = v_buffer[extend_start_in_buffer:extend_end_in_buffer]
-        q_extend[extend_start:extend_end] = torch.empty((b_seq_len_extend[i], Q_HEAD_NUM, HEAD_DIM), dtype=dtype,
-                                                        device='xpu').normal_(mean=0.1, std=0.2)
-
-    o_extend = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device='xpu')
-    o_redundant = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device='xpu')
-
-    b_seq_len_extend = b_seq_len - b_seq_len_prefix
-    max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
-    qo_indptr = torch.zeros((BATCH + 1, ), dtype=torch.int32, device='xpu')
-    qo_indptr[1:BATCH + 1] = torch.cumsum(b_seq_len_extend[:BATCH], dim=0)
-
+    params = gen_args(B, N_CTX, H_Q, H_KV, D, dtype, 'xpu')
+    q_extend, k_extend, v_extend, o_extend, o_redundant = params[0]
+    k_buffer, v_buffer = params[1]
+    qo_indptr, kv_indptr, kv_indices, max_len_extend = params[2]
+    b_req_idx, b_start_loc, b_seq_len, b_seq_len_prefix, max_len_in_batch = params[3]
    custom_mask = None
    mask_indptr = None

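For intuition, `gen_args` can be exercised on its own. The sketch below uses hypothetical sizes and the CPU device (so no XPU is required), unpacks the four returned tuples the same way the new `benchmark` body does, and checks the shape invariants: the extend tensors hold one row per newly added token, while the KV buffers hold prefix plus extend tokens.

```python
# Standalone sanity check for gen_args -- a sketch with made-up sizes,
# run on CPU so it does not require an XPU device.
import torch

BATCH, N_CTX, Q_HEAD_NUM, KV_HEAD_NUM, HEAD_DIM = 2, 64, 32, 8, 128
params = gen_args(BATCH, N_CTX, Q_HEAD_NUM, KV_HEAD_NUM, HEAD_DIM, torch.float32, 'cpu')

q_extend, k_extend, v_extend, o_extend, o_redundant = params[0]
k_buffer, v_buffer = params[1]
qo_indptr, kv_indptr, kv_indices, max_len_extend = params[2]
b_req_idx, b_start_loc, b_seq_len, b_seq_len_prefix, max_len_in_batch = params[3]

# qo_indptr/kv_indptr are CSR-style offsets: entry i+1 minus entry i is the
# extend/prefix length of request i, so the last entry is the token total.
assert q_extend.shape[0] == qo_indptr[-1].item()
assert kv_indices.numel() == kv_indptr[-1].item()
assert k_buffer.shape[0] == kv_indices.numel() + k_extend.shape[0]
```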
@@ -97,7 +113,6 @@ def triton_fn():
                kv_indices, custom_mask, mask_indptr, max_len_extend)
            return o_extend

-    # TODO: decode attention do not have validation function
    if VALIDATE:

        def refer_fn():
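When `VALIDATE` is set, the Triton output is checked against the reference path's `o_redundant`. The comparison helper itself sits outside this hunk; a generic check of that shape might look like the sketch below (the `check_outputs` name and the tolerances are assumptions, not the suite's own helper):

```python
# Generic closeness check for two attention outputs -- a sketch; the
# benchmark suite's own assert helper (not shown in this diff) may differ.
import torch

def check_outputs(o_triton: torch.Tensor, o_ref: torch.Tensor) -> None:
    # Compare in float32 to avoid bfloat16 rounding inside the check itself.
    torch.testing.assert_close(o_triton.float(), o_ref.float(), atol=1e-2, rtol=1e-2)
```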
@@ -112,9 +127,8 @@ def refer_fn():
    else:
        raise NotImplementedError(f'Unsupported provider {provider}')

-    tflops = lambda ms: 2 * BATCH * (Q_HEAD_NUM + KV_HEAD_NUM * N_CTX) * N_CTX * HEAD_DIM * (1e-12) / (ms * 1e-3)
-
-    gbps = lambda ms: 2 * BATCH * (Q_HEAD_NUM + KV_HEAD_NUM * N_CTX) * HEAD_DIM * 2 * (1e-9) / (ms * 1e-3)
+    tflops = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * N_CTX * D * (1e-12) / (ms * 1e-3)
+    gbps = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * D * 2 * (1e-9) / (ms * 1e-3)

    return (gbps(mean), gbps(max_ms), gbps(min_ms)), (tflops(mean), tflops(max_ms), tflops(min_ms)), cv
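The two estimators convert a measured latency in milliseconds into TFLOPS and GB/s. Plugging in the first `x_vals` row gives a feel for the magnitudes; the 1.0 ms latency below is hypothetical, chosen only to show the unit conversions:

```python
# Evaluate the renamed estimators for B=1, SEQ_LENS=[1024, 128, 512],
# H_Q=32, H_KV=8, D=128; 1.0 ms is a made-up latency.
B, H_Q, H_KV, D = 1, 32, 8, 128
N_CTX = sum([1024, 128, 512])  # 1664

tflops = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * N_CTX * D * (1e-12) / (ms * 1e-3)
gbps = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * D * 2 * (1e-9) / (ms * 1e-3)

print(f'{tflops(1.0):.2f} TFLOPS, {gbps(1.0):.2f} GB/s')  # ~5.68 TFLOPS, ~6.83 GB/s
```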