@@ -527,12 +527,12 @@ std::tuple<at::Tensor, at::Tensor> fused_deep_moe(const at::Tensor &x, const at:
527527 const at::Tensor &gmm1_permuted_weight,
528528 const at::Tensor &gmm1_permuted_weight_scale,
529529 const at::Tensor &gmm2_weight, const at::Tensor &gmm2_weight_scale,
530+ const at::Tensor &expert_smooth_scales_optional,
530531 const at::Tensor &expert_scales_optional,
531- c10::optional<c10::string_view> hcom_ep_name,
532- int64_t num_ranks, int64_t rank,
532+ c10::string_view hcom_ep_name,
533+ int64_t num_ranks, int64_t rank, int64_t moe_expert_num,
533534 int64_t shared_expert_num, int64_t shared_expert_rank_num,
534- int64_t num_experts, int64_t global_bs,
535- int64_t quant_mode)
535+ int64_t quant_mode, int64_t global_bs)
536536{
537537 auto x_shape = x.sizes();
538538 auto experts_shape = expert_ids.sizes();
@@ -542,15 +542,18 @@ std::tuple<at::Tensor, at::Tensor> fused_deep_moe(const at::Tensor &x, const at:
542542 at::Tensor output = at::empty({bs, h}, x.options());
543543
544544 bool is_shared_expert = (rank < shared_expert_rank_num);
545- int64_t num_local_experts = is_shared_expert ? 1 : num_experts / (num_ranks - shared_expert_rank_num);
545+ int64_t num_local_experts = is_shared_expert ? 1 : moe_expert_num / (num_ranks - shared_expert_rank_num);
546546 at::Tensor ep_recv_count = at::empty({num_local_experts * num_ranks}, expert_ids.options());
548+ vector<char> group_ep_chrs(hcom_ep_name.begin(), hcom_ep_name.end());
549+ group_ep_chrs.push_back('\0');
550+ char *group_ep_ptr = &group_ep_chrs[0];
548551 EXEC_NPU_CMD(aclnnFusedDeepMoe,
549552 // input
550553 x, expert_ids, gmm1_permuted_weight, gmm1_permuted_weight_scale, gmm2_weight,
551- gmm2_weight_scale, static_cast<const std::nullptr_t &>(nullptr), expert_scales_optional,
554+ gmm2_weight_scale, expert_smooth_scales_optional, expert_scales_optional,
552555 // attr
553- hcom_ep_name, num_ranks, rank, num_experts, shared_expert_num, shared_expert_rank_num, quant_mode,
556+ group_ep_ptr, num_ranks, rank, moe_expert_num, shared_expert_num, shared_expert_rank_num, quant_mode,
554557 global_bs,
555558 // output
556559 output, ep_recv_count);
@@ -619,12 +622,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
619622 " fused_deep_moe(Tensor x, Tensor expert_ids, Tensor gmm1_permuted_weight,"
620623 " Tensor gmm1_permuted_weight_scale,"
621624 " Tensor gmm2_weight, Tensor gmm2_weight_scale,"
622- " Tensor expert_scales_optional,"
623- " str? hcom_ep_name,"
624- " int num_ranks, int rank,"
625+ " Tensor expert_smooth_scales_optional, Tensor expert_scales_optional,"
626+ " str hcom_ep_name,"
627+ " int num_ranks, int rank, int moe_expert_num, "
625628 " int shared_expert_num, int shared_expert_rank_num,"
626- " int num_experts, int global_bs,"
627- " int quant_mode) -> (Tensor output, Tensor ep_recv_count)"
629+ " int quant_mode,"
630+ " int global_bs) -> (Tensor output, Tensor ep_recv_count)"
628631 );
629632
630633 ops.impl("fused_deep_moe", torch::kPrivateUse1, &vllm_ascend::fused_deep_moe);
0 commit comments