@@ -594,11 +594,11 @@ std::tuple<at::Tensor, at::Tensor> dispatch_gmm_combine_decode(
     const at::Tensor &gmm1_permuted_weight_scale,
     const at::Tensor &gmm2_weight,
     const at::Tensor &gmm2_weight_scale,
-    const at::Tensor &expert_smooth_scales_optional,
-    const at::Tensor &expert_scales_optional,
-    c10::string_view hcom_ep_name,
-    int64_t num_ranks,
-    int64_t rank,
+    const c10::optional<at::Tensor> &expert_smooth_scales,
+    const c10::optional<at::Tensor> &expert_scales,
+    c10::string_view group_ep,
+    int64_t ep_rank_size,
+    int64_t ep_rank_id,
     int64_t moe_expert_num,
     int64_t shared_expert_num,
     int64_t shared_expert_rank_num,
@@ -611,11 +611,11 @@ std::tuple<at::Tensor, at::Tensor> dispatch_gmm_combine_decode(
 
     at::Tensor output = at::empty({bs, h}, x.options());
 
-    bool is_shared_expert = (rank < shared_expert_rank_num);
-    int64_t num_local_experts = is_shared_expert ? 1 : moe_expert_num / (num_ranks - shared_expert_rank_num);
-    at::Tensor ep_recv_count = at::empty({num_local_experts * num_ranks}, expert_ids.options());
+    bool is_shared_expert = (ep_rank_id < shared_expert_rank_num);
+    int64_t num_local_experts = is_shared_expert ? 1 : moe_expert_num / (ep_rank_size - shared_expert_rank_num);
+    at::Tensor ep_recv_count = at::empty({num_local_experts * ep_rank_size}, expert_ids.options());
 
-    vector<char> group_ep_chrs(hcom_ep_name.begin(), hcom_ep_name.end());
+    vector<char> group_ep_chrs(group_ep.begin(), group_ep.end());
     group_ep_chrs.push_back('\0');
     char *group_ep_ptr = &group_ep_chrs[0];
     EXEC_NPU_CMD(
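
Note on the `ep_recv_count` sizing above (reviewer sketch, not part of the patch): ranks below `shared_expert_rank_num` host only the single shared expert, the remaining ranks split `moe_expert_num` routed experts evenly, and the buffer plausibly holds one counter per (local expert, peer rank) pair. A minimal standalone C++ sketch of the arithmetic, with made-up numbers (256 routed experts, 34 EP ranks, 2 shared-expert ranks):

    #include <cstdint>
    #include <iostream>

    int main() {
        // All values are hypothetical, chosen only to illustrate the arithmetic.
        const int64_t moe_expert_num = 256;        // routed experts across the EP group
        const int64_t ep_rank_size = 34;           // total EP ranks
        const int64_t shared_expert_rank_num = 2;  // ranks dedicated to the shared expert

        for (int64_t ep_rank_id : {int64_t{0}, int64_t{2}}) {
            const bool is_shared_expert = (ep_rank_id < shared_expert_rank_num);
            const int64_t num_local_experts =
                is_shared_expert ? 1
                                 : moe_expert_num / (ep_rank_size - shared_expert_rank_num);
            // rank 0 -> 1 local expert, recv-count length 34;
            // rank 2 -> 256 / 32 = 8 local experts, recv-count length 272.
            std::cout << "rank " << ep_rank_id << ": " << num_local_experts
                      << " local experts, ep_recv_count length "
                      << num_local_experts * ep_rank_size << "\n";
        }
    }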
@@ -628,12 +628,12 @@ std::tuple<at::Tensor, at::Tensor> dispatch_gmm_combine_decode(
         gmm1_permuted_weight_scale,
         gmm2_weight,
         gmm2_weight_scale,
-        expert_smooth_scales_optional,
-        expert_scales_optional,
+        expert_smooth_scales,
+        expert_scales,
         // input attrs
         group_ep_ptr,
-        num_ranks,
-        rank,
+        ep_rank_size,
+        ep_rank_id,
         moe_expert_num,
         shared_expert_num,
         shared_expert_rank_num,
@@ -719,12 +719,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
719719 " dispatch_gmm_combine_decode(Tensor x, Tensor expert_ids, Tensor gmm1_permuted_weight,"
720720 " Tensor gmm1_permuted_weight_scale,"
721721 " Tensor gmm2_weight, Tensor gmm2_weight_scale,"
722- " Tensor expert_smooth_scales_optional , Tensor expert_scales_optional ,"
723- " str hcom_ep_name ,"
724- " int num_ranks , int rank , int moe_expert_num,"
725- " int shared_expert_num, int shared_expert_rank_num,"
726- " int quant_mode,"
727- " int global_bs) -> (Tensor output, Tensor ep_recv_count)"
722+ " Tensor? expert_smooth_scales=None , Tensor? expert_scales=None ,"
723+ " str group_ep='' ,"
724+ " int ep_rank_size=0 , int ep_rank_id=0 , int moe_expert_num=0 ,"
725+ " int shared_expert_num=1 , int shared_expert_rank_num=0 ,"
726+ " int quant_mode=0 ,"
727+ " int global_bs=0 ) -> (Tensor output, Tensor ep_recv_count)"
728728 );
729729 ops.impl (" dispatch_gmm_combine_decode" , torch::kPrivateUse1 , &vllm_ascend::dispatch_gmm_combine_decode);
730730}
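
With the schema defaults above, the two scale tensors become genuinely optional. A hypothetical call-site sketch against the new C++ signature (tensor values, the group name, and the rank layout are invented for illustration; only the parameter order is taken from the registered schema):

    // Hypothetical call site; inputs are illustrative only.
    // Parameter order follows the registered schema.
    auto [output, ep_recv_count] = vllm_ascend::dispatch_gmm_combine_decode(
        x, expert_ids,
        gmm1_permuted_weight, gmm1_permuted_weight_scale,
        gmm2_weight, gmm2_weight_scale,
        /*expert_smooth_scales=*/c10::nullopt,  // now optional
        /*expert_scales=*/c10::nullopt,         // now optional
        /*group_ep=*/"ep_group_0",              // HCCL EP communicator name
        /*ep_rank_size=*/34,
        /*ep_rank_id=*/0,
        /*moe_expert_num=*/256,
        /*shared_expert_num=*/1,
        /*shared_expert_rank_num=*/2,
        /*quant_mode=*/0,
        /*global_bs=*/0);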