Commit bf8047e

[Unified MoE Layer]: Support GLM4.5 (#2842)
1 parent 0881734 · commit bf8047e

7 files changed (+168 -62 lines)

paddleformers/nn/moe_deepep/modular_moe_layer.py

Lines changed: 55 additions & 31 deletions
@@ -29,6 +29,7 @@
 from .moe_communication import AllToAllMoECommunication, DeepEPMoECommunication
 from .moe_expert import StandardMLPExpert
 from .moe_gate import StandardMoEGate
+from .moe_loss import AddAuxiliaryLoss
 from .moe_loss_instance import get_global_loss_registry

 logger = logging.getLogger(__name__)
@@ -48,6 +49,7 @@ def __init__(
         moe_config: Dict,
         model_type: str,
         expert_class,
+        transpose_gate_weight: bool,
         pretrained_config: Optional[PretrainedConfig] = None,
     ):

@@ -61,23 +63,25 @@ def __init__(
         self.norm_topk_prob = norm_topk_prob
         self.model_type = model_type
         self.expert_class = expert_class
+        self.transpose_gate_weight = transpose_gate_weight

         self.sequence_parallel = pretrained_config.get("sequence_parallel", False)
         self.tensor_parallel_degree = pretrained_config.get("tensor_parallel_degree", 1)
         self.seq_length = pretrained_config.get("seq_length", pretrained_config.get("max_seq_len", 1024))
         self.fuse_up_gate = pretrained_config.get("fuse_attention_ffn", False)
         self.ep_communication_type = pretrained_config.get("ep_communication_type", "deepep")
+        self.n_group = pretrained_config.get("n_group", 1)
+        self.topk_group = pretrained_config.get("topk_group", 1)
+        self.routed_scaling_factor = pretrained_config.get("routed_scaling_factor", 1.0)
+        self.aux_loss_alpha = pretrained_config.get("aux_loss_alpha", 0.0)
+        self.moe_subbatch_token_num = pretrained_config.get("moe_subbatch_token_num", -1)
         try:
             moe_group = fleet.get_hybrid_communicate_group().get_expert_parallel_group()
         except Exception:
             moe_group = None
         self.expert_parallel_degree = dist.get_world_size(moe_group) if moe_group is not None else 1

-        self.custom_gate = moe_config.get("custom_gate", None)
-        self.custom_communication = moe_config.get("custom_communication", None)
         self.gate_activation = moe_config.get("gate_activation", "softmax")
-        self.aux_loss_weight = moe_config.get("aux_loss_weight", 0.01)
-        self.z_loss_weight = moe_config.get("z_loss_weight", 0.0)
         self.topk_method = (
             moe_config.get("train_topk_method", "greedy")
             if self.training
@@ -92,19 +96,23 @@ def __init__(
         self.loss_combiner_name = moe_config.get("loss_combiner_name", "weighted_sum")

         self._init_expert_parallel()
-        if self.custom_gate is not None:
-            self.gate = self.custom_gate
-        else:
-            self.gate = StandardMoEGate(
-                num_experts=self.num_experts,
-                expert_hidden_size=self.hidden_size,
-                drop_tokens=self.drop_tokens,
-                topk_method=self.topk_method,
-                num_experts_per_tok=self.num_experts_per_tok,
-                norm_topk_prob=self.norm_topk_prob,
-                moe_config=moe_config,
-                seq_length=self.seq_length,
-            )
+        self.gate = StandardMoEGate(
+            num_experts=self.num_experts,
+            expert_hidden_size=self.hidden_size,
+            drop_tokens=self.drop_tokens,
+            topk_method=self.topk_method,
+            num_experts_per_tok=self.num_experts_per_tok,
+            norm_topk_prob=self.norm_topk_prob,
+            moe_config=moe_config,
+            seq_length=self.seq_length,
+            n_group=self.n_group,
+            topk_group=self.topk_group,
+            routed_scaling_factor=self.routed_scaling_factor,
+            moe_subbatch_token_num=self.moe_subbatch_token_num,
+            tensor_parallel_degree=self.tensor_parallel_degree,
+            sequence_parallel=self.sequence_parallel,
+            transpose_gate_weight=self.transpose_gate_weight,
+        )

         if self.expert_class is None:
             self.expert_class = StandardMLPExpert
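Note on the new gate wiring: the custom_gate and custom_communication escape hatches are removed, and the layer now always builds a StandardMoEGate, forwarding the GLM4.5-style routing hyperparameters it reads from the pretrained config. A minimal sketch of the keys involved; the values below are illustrative placeholders with the same fallbacks the code uses, not GLM4.5's released settings:

    # Hypothetical config dict for illustration only; a real run reads these
    # keys from the model's PretrainedConfig.
    pretrained_config = {
        "n_group": 1,                  # expert groups for group-limited / noaux_tc routing
        "topk_group": 1,               # how many groups a token may route into
        "routed_scaling_factor": 1.0,  # scale applied to the routed gate weights
        "aux_loss_alpha": 0.0,         # > 0.0 attaches the auxiliary loss in forward()
        "moe_subbatch_token_num": -1,  # sub-batch size for the sequence-level aux loss
    }

    # Mirrors the reads in ModularMoELayer.__init__ shown above.
    n_group = pretrained_config.get("n_group", 1)
    aux_loss_alpha = pretrained_config.get("aux_loss_alpha", 0.0)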
@@ -124,8 +132,14 @@ def __init__(
         if self.model_type == "qwen3_moe":
             pass
         elif self.model_type == "glm4_moe":
-            pass
-        self.experts = nn.LayerList([self.expert_class(**expert_args) for _ in range(self.num_experts)])
+            expert_args["fuse_up_gate"] = self.fuse_up_gate
+
+        self.experts = nn.LayerList([])
+        for i in range(self.num_experts):
+            if i // self.num_experts_per_device == self.moe_rank:
+                self.experts.append(self.expert_class(**expert_args))
+            else:
+                self.experts.append(None)

         if self.expert_parallel_degree > 1:
             self.token_dispatcher = MoEFlexTokenDispatcher(
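The expert list now only materializes the experts owned by the local expert-parallel rank; the other slots hold None, so global expert indices stay stable while parameters for remote experts are never allocated. A self-contained sketch of the index arithmetic, with sizes chosen purely for illustration (num_experts_per_device and moe_rank come from the layer's expert-parallel setup, which is not part of this diff; the sketch assumes num_experts_per_device = num_experts // expert_parallel_degree):

    # Toy sizes, not from any real config.
    num_experts = 8
    expert_parallel_degree = 4
    num_experts_per_device = num_experts // expert_parallel_degree  # 2 experts per rank

    for moe_rank in range(expert_parallel_degree):
        local_ids = [i for i in range(num_experts) if i // num_experts_per_device == moe_rank]
        print(f"rank {moe_rank} instantiates experts {local_ids}; the rest stay None")
    # rank 0 -> [0, 1], rank 1 -> [2, 3], rank 2 -> [4, 5], rank 3 -> [6, 7]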
@@ -137,22 +151,25 @@
         shared_expert_args = {}
         shared_expert_args["config"] = shared_expert_pretrained_config
         shared_expert_args["intermediate_size"] = self.moe_intermediate_size * self.num_shared_experts
+        # Add more arguments for different models
+        if self.model_type == "qwen3_moe":
+            pass
+        elif self.model_type == "glm4_moe":
+            shared_expert_args["fuse_up_gate"] = self.fuse_up_gate
+
         if self.num_shared_experts > 0:
             self.shared_experts = self.expert_class(**shared_expert_args)
         else:
             self.shared_experts = None

-        if self.custom_communication is not None:
-            self.communication = self.custom_communication
+        if self.ep_communication_type == "deepep":
+            self.communication = DeepEPMoECommunication()
+        elif self.ep_communication_type == "alltoall":
+            self.communication = AllToAllMoECommunication()
         else:
-            if self.ep_communication_type == "deepep":
-                self.communication = DeepEPMoECommunication()
-            elif self.ep_communication_type == "alltoall":
-                self.communication = AllToAllMoECommunication()
-            else:
-                raise ValueError(
-                    f"Unsupported communication type: {self.ep_communication_type}, please choose from ['deepep', 'alltoall']"
-                )
+            raise ValueError(
+                f"Unsupported communication type: {self.ep_communication_type}, please choose from ['deepep', 'alltoall']"
+            )

         if hasattr(dist, "fleet") and dist.is_initialized() and self.expert_parallel_degree > 1:
             self.is_mp_moe = False
@@ -224,6 +241,9 @@ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
         capacity, topk_weights, topk_indices, gates_masked, mask, priorities, aux_loss, z_loss = self.gate(
             hidden_states
         )
+        # topk_weights, topk_indices will be used in AllToAllMoECommunication
+        # gates_masked, mask will be used in DeepEPMoECommunication
+        # capacity, priorities are not used currently

         if self.expert_parallel_degree > 1:
             output = self._forward_with_ep_parallel(
@@ -237,16 +257,20 @@ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
             reshaped_input = hidden_states
             output = self._forward_traditional_moe(reshaped_input, topk_indices, topk_weights)

-        output = output.reshape(orig_shape)
+        if self.training and self.aux_loss_alpha > 0.0:
+            aux_loss = aux_loss * self.aux_loss_alpha
+            output = AddAuxiliaryLoss.apply(output, aux_loss)

         if self.shared_experts is not None:
             shared_output = self.shared_experts(residuals)
             output = output + shared_output

+        output = output.reshape(orig_shape)
+
         if self.expert_parallel_degree <= 1 and self.sequence_parallel:
             output = ScatterOp.apply(output)

-        return output, aux_loss
+        return output

     def _forward_traditional_moe(
         self, hidden_states: paddle.Tensor, selected_experts: paddle.Tensor, topk_weights: paddle.Tensor
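With this change the layer's public contract shifts: the auxiliary loss is scaled by aux_loss_alpha and folded into the activation graph through AddAuxiliaryLoss, so forward() returns a single tensor instead of an (output, aux_loss) tuple. A hedged caller-side sketch; the surrounding training-loop names are assumptions, not code from this repository:

    # Before this commit (sketch): the caller combined the aux loss itself.
    #   output, aux_loss = moe_layer(hidden_states)
    #   loss = task_loss + aux_loss
    # After this commit (sketch): the aux-loss gradient rides along with `output`,
    # so the caller only consumes the hidden states.
    #   output = moe_layer(hidden_states)
    #   loss = task_loss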

paddleformers/nn/moe_deepep/moe_factory.py

Lines changed: 2 additions & 0 deletions

@@ -26,6 +26,7 @@ def create_from_model_name(
     train_topk_method: str,
     inference_topk_method: str,
     drop_tokens: bool,
+    transpose_gate_weight: bool,
 ) -> ModularMoELayer:
     model_type = getattr(pretrained_config, "model_type", None)
     if model_type is None:
@@ -55,6 +56,7 @@ def create_from_model_name(
         moe_config=moe_config,
         model_type=model_type,
         expert_class=expert_class,
+        transpose_gate_weight=transpose_gate_weight,
         pretrained_config=pretrained_config,
     )

paddleformers/nn/moe_deepep/moe_gate.py

Lines changed: 66 additions & 30 deletions

@@ -29,26 +29,26 @@

 class MoEGateMixin:
     def gate_score_func(self, logits: paddle.Tensor) -> paddle.Tensor:
+        # [..., hidden_dim] -> [..., num_experts]
         with paddle.amp.auto_cast(False):
-            # [..., hidden_dim] -> [..., num_experts]
             scoring_func = getattr(self, "scoring_func", None)
             if scoring_func == "softmax":
-                scores = F.softmax(logits, axis=-1)
+                scores = F.softmax(logits.cast("float32"), axis=-1)
             elif scoring_func == "sigmoid":
-                scores = F.sigmoid(logits)
+                scores = F.sigmoid(logits.cast("float32"))
             elif scoring_func == "tanh":
-                scores = F.tanh(logits)
+                scores = F.tanh(logits.cast("float32"))
             elif scoring_func == "relu":
-                scores = F.relu(logits)
+                scores = F.relu(logits.cast("float32"))
             elif scoring_func == "gelu":
-                scores = F.gelu(logits)
+                scores = F.gelu(logits.cast("float32"))
             elif scoring_func == "leaky_relu":
-                scores = F.leaky_relu(logits)
+                scores = F.leaky_relu(logits.cast("float32"))
             else:
                 logger.warning_once(
                     f"insupportable scoring function for MoE gating: {scoring_func}, use softmax instead"
                 )
-                scores = F.softmax(logits, axis=-1)
+                scores = F.softmax(logits.cast("float32"), axis=-1)
         return scores

     def gumbel_rsample(self, logits: paddle.Tensor) -> paddle.Tensor:
@@ -130,9 +130,7 @@ def _cal_aux_loss(self, gates, mask):
         aux_loss = paddle.sum(me * ce) * float(self.num_experts)
         return aux_loss

-    def _cal_seq_aux_loss(self, probs, top_k, routing_map, seq_length):
-        max_seq_len = seq_length
-
+    def _cal_seq_aux_loss(self, probs, top_k, routing_map, max_seq_len):
         sub_max_seq_len = max_seq_len
         if hasattr(self, "moe_subbatch_token_num") and self.moe_subbatch_token_num > 0:
             sub_max_seq_len = self.moe_subbatch_token_num * self.tensor_parallel_degree
@@ -162,7 +160,6 @@ def _cal_seq_aux_loss(self, probs, top_k, routing_map, seq_length):
         )
         # [B, E] -> [B] -> []
         seq_aux_loss = (cost_coeff * all_probs.sum(axis=seq_axis) / max_seq_len).sum(axis=1).mean()
-
         return seq_aux_loss

     def _cal_z_loss(self, logits) -> paddle.Tensor:
@@ -361,6 +358,9 @@ def _topk_noaux_tc(
        )  # [n, e]
        tmp_scores = scores_for_choice * score_mask  # [n, e]
        topk_weight, topk_idx = paddle.topk(tmp_scores, k=k, axis=-1, sorted=True)
+
+        # The bias term b is used only to adjust affinity scores for Top-K expert selection (routing); it does not affect gating.
+        # The gate applied during dispatch and to weight the FFN output is computed from the original affinity score s_{i,t} (without the bias).
        topk_weight = scores.take_along_axis(topk_idx, axis=1) if not self.training else topk_weight

        return topk_weight, topk_idx
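The two comments above describe DeepSeek-style "no-aux" routing: a learned per-expert correction bias shifts the scores used to pick the top-k experts, while the weights used to combine expert outputs still come from the unbiased scores. A self-contained sketch of that split, following the inference branch above (group-limited masking omitted; the numbers are made up for illustration):

    import paddle

    scores = paddle.to_tensor([[0.30, 0.25, 0.25, 0.20]])             # s_{i,t}, shape [n, e]
    e_score_correction_bias = paddle.to_tensor([0.0, 0.0, 0.2, 0.0])  # per-expert bias b

    # Selection uses the biased scores ...
    scores_for_choice = scores + e_score_correction_bias
    _, topk_idx = paddle.topk(scores_for_choice, k=2, axis=-1, sorted=True)

    # ... but the dispatch/combine weights come from the original scores.
    topk_weight = scores.take_along_axis(topk_idx, axis=1)
    print(topk_idx.numpy(), topk_weight.numpy())  # picks experts [2, 0] with weights [0.25, 0.30]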
@@ -378,6 +378,13 @@ def __init__(
        norm_topk_prob: bool,
        moe_config: Dict,
        seq_length: int,
+        n_group: int,
+        topk_group: int,
+        routed_scaling_factor: float,
+        moe_subbatch_token_num: int,
+        tensor_parallel_degree: int,
+        sequence_parallel: bool,
+        transpose_gate_weight: bool,
    ):
        super(StandardMoEGate, self).__init__()

@@ -390,8 +397,15 @@ def __init__(
        # force keep in float32 when using amp
        self._cast_to_low_precision = False
        self.seq_length = seq_length
-
-        self.scoring_func = moe_config.get("scoring_func", "softmax")
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.routed_scaling_factor = routed_scaling_factor
+        self.moe_subbatch_token_num = moe_subbatch_token_num
+        self.tensor_parallel_degree = tensor_parallel_degree
+        self.sequence_parallel = sequence_parallel
+        self.transpose_gate_weight = transpose_gate_weight
+
+        self.scoring_func = moe_config.get("gate_activation", "softmax")
        self.capacity_factor = moe_config.get("capacity_factor", 1.0)
        self.eval_capacity_factor = moe_config.get("eval_capacity_factor", 1.0)
        self.min_capacity = moe_config.get("min_capacity", 1)
@@ -401,26 +415,45 @@ def __init__(
        self.use_rts = moe_config.get("use_rts", True)
        self.top2_2nd_expert_sampling = moe_config.get("top2_2nd_expert_sampling", True)
        self.drop_policy = moe_config.get("drop_policy", "probs")
-        self.n_group = moe_config.get("n_group", 1)  # for group_limited_greedy
-        self.topk_group = moe_config.get("topk_group", 1)  # for group_limited_greedy
-        self.routed_scaling_factor = moe_config.get("routed_scaling_factor", 1.0)
-        self.seq_aux = moe_config.get("seq_aux", False)
+        self.seq_aux = moe_config.get("seq_aux", True)

        if self.global_aux_loss:
            assert self.group is not None, "group is required when global_aux_loss is True"
            self.rank = dist.get_rank(self.group)

-        self.weight = paddle.create_parameter(
-            shape=[self.expert_hidden_size, self.num_experts],
-            dtype="float32",
-            default_initializer=paddle.nn.initializer.Uniform(),
-        )
+        # Accordding to the shape of gate weights in model checkpoint
+        if not transpose_gate_weight:
+            self.weight = paddle.create_parameter(
+                shape=[self.expert_hidden_size, self.num_experts],
+                dtype="float32",
+                default_initializer=paddle.nn.initializer.Uniform(),
+            )
+        else:
+            self.weight = paddle.create_parameter(
+                shape=[self.num_experts, self.expert_hidden_size],
+                dtype="float32",
+                default_initializer=paddle.nn.initializer.Uniform(),
+            )
+
+        if self.topk_method == "noaux_tc":
+            self.register_buffer("e_score_correction_bias", paddle.zeros((self.num_experts,), dtype=paddle.float32))
+            self._cast_to_low_precision = False
+            self.expert_usage = paddle.zeros(
+                shape=[self.num_experts],
+                dtype=paddle.int64,
+            )  # Used in MoECorrectionBiasAdjustCallback
+            self.expert_usage.stop_gradient = True

    def forward(
        self,
        gates: paddle.Tensor,
    ) -> Tuple[int, paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
-        return self.topkgating(gates)
+        capacity, top_gate, top_idx, gates_masked, mask, token_priority, l_aux, l_zloss = self.topkgating(gates)
+        exp_counts = paddle.sum(mask.cast(paddle.int64), axis=0)
+        if self.topk_method == "noaux_tc":
+            with paddle.no_grad():
+                self.expert_usage += exp_counts
+        return capacity, top_gate, top_idx, gates_masked, mask, token_priority, l_aux, l_zloss

    def topkgating(
        self,
@@ -434,14 +467,19 @@ def topkgating(
        elif len(gates.shape) == 2:
            batch_size_seq_len, d_model = gates.shape

+        with paddle.amp.auto_cast(False):
+            gates = gates.cast(self.weight.dtype)
+            if not self.transpose_gate_weight:
+                logits = F.linear(gates.cast("float32"), self.weight.cast("float32"))
+            else:
+                logits = F.linear(gates.cast("float32"), self.weight.cast("float32").t())
+            gates = self.gate_score_func(logits=logits)
+            gates = gates.cast(paddle.float32)
+
        gates_ori = gates
        if self.scoring_func == "sigmoid":
            gates_ori = gates_ori / (gates_ori.sum(axis=-1, keepdim=True) + 1e-20)

-        logits = F.linear(gates, self.weight)
-
-        gates = self.gate_score_func(logits=logits)
-
        l_zloss = self._cal_z_loss(gates)

        if self.topk_method == "greedy":
@@ -506,9 +544,7 @@ def topkgating(
        denom_s = paddle.clip(gates_s, min=paddle.finfo(gates_masked.dtype).eps)
        if self.norm_topk_prob:
            gates_masked = gates_masked / denom_s
-        gates_masked = gates_masked.to(gates.dtype)
        gates_masked *= self.routed_scaling_factor
-
        return (
            capacity,  # new capacity
            top_gate,  # weights of selected experts for each token [num_tokens, num_experts_per_token]
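The transpose_gate_weight flag exists because checkpoints disagree on the router weight layout: some store it as [hidden_size, num_experts], others as [num_experts, hidden_size], and the gate now creates whichever shape matches the checkpoint so the weights load without a transpose pass. Both branches of the F.linear call above compute the same logits; a standalone sketch with made-up sizes:

    import paddle
    import paddle.nn.functional as F

    tokens, hidden_size, num_experts = 8, 16, 4
    x = paddle.randn([tokens, hidden_size], dtype="float32")

    w = paddle.randn([hidden_size, num_experts], dtype="float32")  # transpose_gate_weight=False layout
    w_t = w.t()                                                    # [num_experts, hidden_size] checkpoint layout

    logits_plain = F.linear(x, w)             # branch taken when the flag is False
    logits_transposed = F.linear(x, w_t.t())  # branch taken when the flag is True
    print(paddle.allclose(logits_plain, logits_transposed).item())  # True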

paddleformers/nn/moe_deepep/moe_loss.py

Lines changed: 21 additions & 0 deletions

@@ -58,6 +58,27 @@ def __call__(
         pass


+class AddAuxiliaryLoss(paddle.autograd.PyLayer):
+    """
+    The trick function of adding auxiliary (aux) loss,
+    which includes the gradient of the aux loss during backpropagation.
+    """
+
+    @staticmethod
+    def forward(ctx, x, loss):
+        assert paddle.numel(loss) == 1
+        ctx.dtype = loss.dtype
+        ctx.required_aux_loss = not loss.stop_gradient
+        return x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_loss = None
+        if ctx.required_aux_loss:
+            grad_loss = paddle.ones(1, dtype=ctx.dtype)
+        return grad_output, grad_loss
+
+
 class LossCombiner(Protocol):
     def __call__(self, losses: Dict[str, paddle.Tensor], configs: Dict[str, LossConfig]) -> paddle.Tensor:
         pass
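A quick way to see what the PyLayer does: forward() is an identity on x, and backward() injects a unit gradient into the aux loss, so whatever produced that loss (the gate weights) still receives load-balancing gradients even though the scalar never appears in the user-visible loss. A hedged usage sketch, assuming the import path follows the file location above and using stand-in tensors rather than a real gate:

    import paddle
    from paddleformers.nn.moe_deepep.moe_loss import AddAuxiliaryLoss

    x = paddle.randn([4, 8])
    x.stop_gradient = False

    gate_param = paddle.ones([1])   # stand-in for a router parameter
    gate_param.stop_gradient = False
    aux_loss = gate_param * 0.5     # stand-in for the load-balancing loss (numel == 1)

    out = AddAuxiliaryLoss.apply(x, aux_loss)
    out.sum().backward()

    print(paddle.allclose(out, x).item())  # True: the output is numerically unchanged
    print(gate_param.grad)                 # [0.5]: the aux-loss gradient was injected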
