[FP8][Kernel] add envs.VLLM_USE_CUTLASS_MOE_FP8

JackChuang · JackChuang · commit 5be9ad1a64b4 · 2025-06-19T07:09:08.000Z
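Add an environment flag, `VLLM_USE_CUTLASS_MOE_FP8`, that controls whether
vLLM uses the CUTLASS kernel for FP8 MoE. The flag is disabled by default and
is enabled only when set to `1` or `true` (case-insensitive), so the original
execution path remains completely untouched unless a user opts in.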

Usage:
$ VLLM_USE_CUTLASS_MOE_FP8=1 python3 -m vllm.entrypoints.openai.api_server ...
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -109,6 +109,7 @@
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
     VLLM_USE_DEEP_GEMM: bool = False
     VLLM_XGRAMMAR_CACHE_MB: int = 0
+    VLLM_USE_CUTLASS_MOE_FP8: bool = False
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
 
 
@@ -718,6 +719,11 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     "VLLM_XGRAMMAR_CACHE_MB":
     lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
 
+    # Whether vLLM should use the CUTLASS kernel for FP8 MoE (default: off)
+    "VLLM_USE_CUTLASS_MOE_FP8":
+    lambda: (os.environ.get("VLLM_USE_CUTLASS_MOE_FP8", "False").lower() in
+                 ("true", "1")),
+
     # Control the threshold for msgspec to use 'zero copy' for
     # serialization/deserialization of tensors. Tensors below
     # this limit will be encoded into the msgpack buffer, and