@@ -283,6 +283,10 @@ def combine(
             self.expert_size_per_partition, num_tokens_per_expert, self.hidden_size
         )
 
+        if deep_ep_topk_weights.dtype != torch.float32:
+            # DeepEP low-latency combine requires fp32 weights
+            deep_ep_topk_weights = deep_ep_topk_weights.to(torch.float32)
+
         if self.use_low_precision_combine:
             if self._has_nvfp4():
                 precision = "nvfp4"
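For context, a minimal standalone sketch of the dtype guard this hunk adds. The helper name, tensor name, and shape below are illustrative; the constraint that DeepEP's low-latency combine accepts only fp32 routing weights is taken from the comment in the diff.

```python
import torch

def ensure_fp32_weights(topk_weights: torch.Tensor) -> torch.Tensor:
    """Upcast per-token routing weights to fp32 before the combine step.

    DeepEP's low-latency combine is assumed to require fp32 weights;
    the upcast is skipped when the tensor is already fp32.
    """
    if topk_weights.dtype != torch.float32:
        topk_weights = topk_weights.to(torch.float32)
    return topk_weights

# Routing weights often arrive as bf16, e.g. from a bf16 router softmax.
weights = torch.rand(8, 2, dtype=torch.bfloat16)  # [num_tokens, top_k]
assert ensure_fp32_weights(weights).dtype == torch.float32
```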
tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py (4 additions & 1 deletion)
@@ -756,7 +756,10 @@ def _forward_chunk_impl(
             if self.enable_dummy_allreduce:
                 self.dummy_allreduce()
             # Use unified combine interface (reads dispatch state from strategy)
-            final_hidden_states = self.comm.combine(final_hidden_states)
+            all_rank_max_num_tokens = max(all_rank_num_tokens)
+            final_hidden_states = self.comm.combine(
+                final_hidden_states, all_rank_max_num_tokens=all_rank_max_num_tokens
+            )
         else:
             # For the non-comm case, this should be attention TP or a single rank;
             # only check whether an allreduce is needed.
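A minimal sketch of why the max is taken before combine, assuming `all_rank_num_tokens` holds per-rank token counts gathered across expert-parallel ranks; the values below are hypothetical.

```python
# Hypothetical per-rank token counts across expert-parallel ranks.
all_rank_num_tokens = [96, 128, 64, 80]

# Every rank passes the same worst-case bound, so the combine
# communication buffers line up across ranks regardless of how many
# tokens each rank actually holds.
all_rank_max_num_tokens = max(all_rank_num_tokens)
assert all_rank_max_num_tokens == 128
```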
@@ -531,6 +531,10 @@ def run_moe(
 
     routing_bias = routing_bias if router_logits is not None else None
 
+    if token_selected_experts is not None:
+        # For cases like DeepEP low latency, where a fake top_k=1 may be used
+        top_k = token_selected_experts.shape[-1]
+
     # Ensure x_sf is 2D before flattening
     if x_sf is not None:
         assert len(
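A minimal sketch of the `top_k` derivation with hypothetical routing output: reading `top_k` off the tensor's last dimension keeps the MoE kernel consistent with what the dispatcher actually sent, such as the fake top_k=1 case the comment mentions.

```python
import torch

# Hypothetical dispatcher output: one expert index per token ("fake"
# top_k=1, as in DeepEP low latency). Shape is [num_tokens, top_k].
token_selected_experts = torch.tensor([[3], [0], [7], [3]], dtype=torch.int32)

# Prefer the tensor's actual last dimension over any configured top_k,
# which may not match what was dispatched.
top_k = token_selected_experts.shape[-1]
assert top_k == 1
```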