Skip to content

Commit c5fa4c8

Browse files
[ROCm] update deepseek FP4 script (#821)
Signed-off-by: zejunchen-zejun <[email protected]>
1 parent cc28cd9 commit c5fa4c8

File tree

2 files changed

+5
-10
lines changed

2 files changed

+5
-10
lines changed

evaluation/deepseek_fp4/launch_deepseekr1_fp4_DP_EP.sh

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
export VLLM_USE_V1=1
2-
export VLLM_USE_TRITON_FLASH_ATTN=0
32
# export VLLM_LOGGING_LEVEL=DEBUG
43
export VLLM_RPC_TIMEOUT=1800000
54
export VLLM_ROCM_USE_AITER=1
@@ -10,7 +9,7 @@ export VLLM_ROCM_USE_TRITON_ROPE=1 # add for acc
109
export VLLM_DISABLE_COMPILE_CACHE=1
1110
# FIXME: for now disable fp4 asm gemm because of running issue
1211
export VLLM_ROCM_USE_AITER_FP4_ASM_GEMM=0
13-
#export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 # for now disable
12+
export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 # disable for acc
1413

1514
export TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1
1615
export TRITON_HIP_USE_ASYNC_COPY=1
@@ -28,7 +27,6 @@ export SAFETENSORS_FAST_GPU=1
2827
model_path=/data/pretrained-models/amd/DeepSeek-R1-MXFP4-Preview
2928
echo "running $model_path"
3029

31-
# FIXME: for now use 0.8 for memory utilization
3230
vllm serve $model_path \
3331
--host localhost \
3432
--port 9000 \

evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,9 @@
11
export VLLM_USE_V1=1
2-
export VLLM_USE_TRITON_FLASH_ATTN=1 # use triton mha
32
# export VLLM_LOGGING_LEVEL=DEBUG
43
export VLLM_RPC_TIMEOUT=1800000
54
export VLLM_ROCM_USE_AITER=1
65
export VLLM_ROCM_USE_AITER_MHA=0
7-
export VLLM_ROCM_USE_AITER_MLA=0 # use triton mha
6+
export VLLM_ROCM_USE_AITER_MLA=1
87
export VLLM_ROCM_USE_AITER_MOE=1
98
export VLLM_ROCM_USE_TRITON_ROPE=1 # add for acc
109
export VLLM_DISABLE_COMPILE_CACHE=1
@@ -28,7 +27,6 @@ export SAFETENSORS_FAST_GPU=1
2827
model_path=/data/pretrained-models/amd/DeepSeek-R1-MXFP4-Preview
2928
echo "running $model_path"
3029

31-
# FIXME: for now use 0.8 for memory utilization
3230
vllm serve $model_path \
3331
--host localhost \
3432
--port 9000 \
@@ -37,12 +35,11 @@ vllm serve $model_path \
3735
--trust-remote-code \
3836
--no-enable-prefix-caching \
3937
--disable-log-requests \
40-
--enforce-eager \
41-
--gpu_memory_utilization 0.7 \
38+
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
39+
--gpu_memory_utilization 0.8 \
4240
--async-scheduling \
4341
--block-size 16 \
4442
--load-format fastsafetensors \
4543
--seed 123 2>&1 | tee log.server.log &
4644

47-
# --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
48-
# --enable-expert-parallel \
45+
# --enable-expert-parallel \

0 commit comments

Comments
 (0)