NVIDIA · trtllm-agent · Jul 3, 2026
@@ -774,13 +774,14 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             layer_type="moe",
         )
 
-        # Single merge-point all_reduce for routed + shared partial sums.
-        # Both branches produce per-rank partial outputs under TP/EP sharding
-        # (routed: MoEShardableNode; shared: rowwise down_proj inside the MLP).
-        # One reduction on the sum lifts both to full; reducing before the add
-        # would mix a full routed contribution with an unreduced shared one.
-        expert_output = expert_output + shared_expert_output
+        # The shared expert is replicated (Qwen3_5MoeMLP intentionally omits
+        # ``layer_type`` and the YAML ``shard_layers`` whitelist excludes it),
+        # so its output is already the full value on every rank. All-reduce
+        # only the sharded routed-expert partial sums, then add the replicated
+        # shared output; adding it before the all-reduce would sum it across
+        # ranks and scale it by the TP world size.
         expert_output = torch.ops.auto_deploy.all_reduce(expert_output, layer_type="moe")
+        expert_output = expert_output + shared_expert_output
 
         expert_output = expert_output.reshape(batch_size, sequence_length, hidden_dim)
         return expert_output

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -39,7 +39,6 @@ accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-at
 accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[fp8-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6367792)
 accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[nvfp4-1-attn_dp_off-trtllm] SKIP (temporary ToT main waive; B200 AutoDeploy NVFP4 GSM8K accuracy below threshold)
 accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[nvfp4-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6367792)
-accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_nvfp4[8] SKIP (https://nvbugs/6412108)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] SKIP (https://nvbugs/6379333)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/6281818)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] SKIP (https://nvbugs/6281818)