@@ -577,7 +577,11 @@ def forward(
 class MoEGate(nn.Module):
     """Deepseek Gate."""

-    def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device = None):
+    def __init__(self,
+                 config: Any,
+                 dtype: torch.dtype = None,
+                 device: torch.device = None,
+                 info: eplb.EPLBDispatchInfo = None):
         super().__init__()
         self.config = config
         self.top_k = config.num_experts_per_tok
@@ -602,6 +606,7 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
         self.softmax_topk = SoftmaxTopK(self.top_k)

         self.fake_eplb = getenv('LMDEPLOY_FAKE_EPLB', 'False').lower() == 'true'
+        self.eplb_dispatch_info = info

     def _compute_scores(self, logits: torch.Tensor):
         """compute scores."""
@@ -665,6 +670,9 @@ def forward(self, hidden_states: torch.Tensor):
         if not self.renormalize or self.topk_method == 'noaux_tc':
            topk_weight = topk_weight * self.routed_scaling_factor

+        if self.eplb_dispatch_info is not None:
+            topk_idx = eplb.topk_ids_logical_to_physical(topk_idx, self.eplb_dispatch_info)
+
         return topk_weight, topk_idx

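The branch added above converts the gate's logical top-k expert ids into physical expert ids whenever EPLB dispatch info is attached. As a rough illustration of what such a remap can look like (the routing-table layout and helper below are assumptions for this sketch, not eplb's actual implementation):

import torch

# Hypothetical per-layer table: row i lists the physical replicas serving
# logical expert i (EPLB may replicate hot experts across ranks).
log2phy = torch.tensor([[0, 4], [1, 5], [2, 6], [3, 7]])

def remap_logical_to_physical(topk_idx: torch.Tensor) -> torch.Tensor:
    """Pick one physical replica for every routed logical expert id."""
    replica = torch.randint(0, log2phy.shape[1], topk_idx.shape)
    return log2phy[topk_idx, replica]

topk_idx = torch.tensor([[0, 3], [2, 1]])        # logical ids from the gate
physical = remap_logical_to_physical(topk_idx)   # e.g. tensor([[4, 3], [6, 5]])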
@@ -685,18 +693,19 @@ def __init__(self, config: Any, layer_idx, dtype: torch.dtype = None, device: to
         self.n_group = config.n_group
         self.topk_group = config.topk_group

-        self.gate = MoEGate(config, dtype=dtype, device=device)
-
         dist_ctx = get_dist_manager().current_context()
         dp = dist_ctx.dp
         world_size = dist_ctx.world_size
         moe_all_reduce = dp > 1 and dist_ctx.tp > 1
         if get_dist_manager().current_context().dist_config.enable_eplb:
-            self.eplb_dispatch_info = eplb.EPLBDispatchInfo.init_new(
+            eplb_dispatch_info = eplb.EPLBDispatchInfo.init_new(
                 ep_rank=dist_ctx.ep_rank,
                 layer_idx=layer_idx,
             )
             self.num_experts = eplb.get_global_eplb_metadata().num_physical_experts()
+            self.gate = MoEGate(config, dtype=dtype, device=device, info=eplb_dispatch_info)
+        else:
+            self.gate = MoEGate(config, dtype=dtype, device=device, info=None)
         self.experts = build_fused_moe(
             self.hidden_dim,
             self.ffn_dim,
@@ -730,9 +739,6 @@ def forward(self, hidden_states: torch.Tensor):
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         topk_weights, topk_ids = self.gate(hidden_states)
-        if get_dist_manager().current_context().dist_config.enable_eplb:
-            topk_ids = eplb.topk_ids_logical_to_physical(topk_ids, self.eplb_dispatch_info)
-
         out_states = self.experts(
             hidden_states,
             topk_weights,
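Net effect: the logical-to-physical remap moves out of the MoE block's forward and into MoEGate, so the forward path no longer consults dist_config and the gate hands back physical expert ids directly. A toy stand-in sketching the new pattern (names and shapes here are illustrative only, not the lmdeploy classes):

import torch

class TinyGate(torch.nn.Module):
    """Toy stand-in for the refactored MoEGate: remaps ids itself when EPLB info is set."""

    def __init__(self, num_experts: int, hidden_dim: int, top_k: int, log2phy: torch.Tensor = None):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(num_experts, hidden_dim))
        self.top_k = top_k
        self.log2phy = log2phy  # plays the role of eplb_dispatch_info

    def forward(self, hidden_states: torch.Tensor):
        logits = hidden_states @ self.weight.t()
        topk_weight, topk_idx = logits.softmax(dim=-1).topk(self.top_k, dim=-1)
        if self.log2phy is not None:  # the remap now lives inside the gate
            topk_idx = self.log2phy[topk_idx]
        return topk_weight, topk_idx

# Callers no longer branch on enable_eplb: the gate already returns physical ids.
gate = TinyGate(num_experts=4, hidden_dim=8, top_k=2, log2phy=torch.tensor([0, 2, 4, 6]))
topk_weights, topk_ids = gate(torch.randn(3, 8))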