 
 if prefill_context_parallel_enable():
     from vllm.distributed import get_pcp_group
+if shared_expert_dp_enabled():
+    from vllm.distributed import get_tensor_model_parallel_world_size
 
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import set_default_torch_dtype
@@ -298,6 +300,10 @@ def dummy_run(self,
             self.model(input_ids=input_ids,
                        positions=positions,
                        hidden_states=previous_hidden_states)
+            if self.enable_shared_expert_dp:
+                positions = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(positions, True)
+                previous_hidden_states = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
+                    previous_hidden_states, True)
         forward_context = get_forward_context()
         if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \
                 not forward_context.capturing:
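
For readers following the shared-expert-DP path: the all-gather/unpad pair here mirrors the pad/reduce-scatter applied before the layer stack, so `positions` and `previous_hidden_states` come back to the full token count after the dummy forward. A minimal single-process sketch of just the shape bookkeeping (the `pad_and_shard`/`gather_and_unpad` helpers below are illustrative stand-ins, not the real `torch.ops.vllm` custom ops):

```python
import torch

def pad_and_shard(x: torch.Tensor, tp: int, rank: int) -> torch.Tensor:
    # Pad the token dim up to a multiple of tp, then keep this rank's shard
    # (stand-in for the reduce_scatter side of the collective).
    num_tokens = x.shape[0]
    padded = (num_tokens + tp - 1) // tp * tp
    x = torch.nn.functional.pad(x, (0, 0, 0, padded - num_tokens))
    return x.chunk(tp, dim=0)[rank]

def gather_and_unpad(shards: list[torch.Tensor], num_tokens: int) -> torch.Tensor:
    # Concatenate all ranks' shards and drop the padding rows
    # (stand-in for the all-gather-and-unpad side).
    return torch.cat(shards, dim=0)[:num_tokens]

tp, num_tokens, hidden = 4, 10, 8
h = torch.randn(num_tokens, hidden)
shards = [pad_and_shard(h, tp, r) for r in range(tp)]
assert shards[0].shape[0] == 3            # 12 padded tokens / 4 ranks = 3 per rank
restored = gather_and_unpad(shards, num_tokens)
assert torch.equal(restored, h)           # the round trip preserves every token
```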
@@ -690,6 +696,12 @@ def _propose(
                 (self.num_speculative_tokens + 1))
             batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
                                                uniform_decode=uniform_decode)
+            # Enabling sp/shared_expert_dp performs a reduce_scatter operation.
+            if self.enable_shared_expert_dp:
+                tp_world_size = get_tensor_model_parallel_world_size()
+                reduce_num_input_tokens = num_input_tokens // tp_world_size
+                batch_descriptor = BatchDescriptor(num_tokens=reduce_num_input_tokens,
+                                                   uniform_decode=uniform_decode)
         else:
             batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
                                                uniform_decode=False)
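
The descriptor change only records the per-rank token count after that reduce_scatter; since the batch descriptor is what the CUDA-graph runtime keys on, it has to match the per-rank shape rather than the global one. A worked example of the arithmetic, assuming the padded token count is a multiple of the TP size (numbers are illustrative):

```python
# Illustrative numbers only: a padded batch of 512 tokens with TP=4.
num_input_tokens = 512
tp_world_size = 4

# After the sp/shared_expert_dp reduce_scatter each rank holds
# 512 // 4 = 128 tokens, so the descriptor is keyed on 128, not 512.
reduce_num_input_tokens = num_input_tokens // tp_world_size
assert reduce_num_input_tokens == 128
```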
@@ -741,12 +753,15 @@ def _propose(
             positions = torch.ops.vllm.maybe_pad_and_reduce(
                 positions)
             positions = positions.squeeze(-1)
+            hidden_states = torch.ops.vllm.maybe_pad_and_reduce(
+                hidden_states)
 
             hidden_states = self.model(input_ids=input_ids,
                                        positions=positions,
                                        hidden_states=hidden_states)
             hidden_states = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
                 hidden_states.contiguous(), True)
+
             forward_context = get_forward_context()
             if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
                 if self.vllm_config.model_config.use_mla:
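
Note the symmetry in this hunk: `hidden_states` (like `positions`) is padded and reduced to the local shard before the draft forward, then gathered and unpadded right after, and the `squeeze(-1)` suggests `positions` crosses the collective with a trailing singleton dimension. A toy illustration of that shape handling, with plain slicing standing in for the actual reduce_scatter (assumption: the real op works on a `[tokens, 1]` layout for positions):

```python
import torch

tp, num_tokens = 4, 10
positions = torch.arange(num_tokens)                    # 1-D [num_tokens]

# Give positions a trailing dim, pad the token dim to a multiple of tp,
# and keep one rank's shard (stand-in for the pad-and-reduce on rank 0).
padded = (num_tokens + tp - 1) // tp * tp
shard = torch.nn.functional.pad(positions.unsqueeze(-1),
                                (0, 0, 0, padded - num_tokens))
shard = shard.chunk(tp, dim=0)[0]

# Back to 1-D for the attention metadata / RoPE lookups, as in the diff.
local_positions = shard.squeeze(-1)
assert local_positions.shape == (padded // tp,)         # 12 // 4 = 3 tokens
```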
@@ -821,20 +836,20 @@ def _propose(
                 batch_size,
                 attn_metadata_i.decode.actual_seq_lengths_q)
             attn_metadata_i.decode.cos = builder.cos_cache[
-                positions].unsqueeze(1).unsqueeze(2)
+                positions[:batch_size]].unsqueeze(1).unsqueeze(2)
             attn_metadata_i.decode.sin = builder.sin_cache[
-                positions].unsqueeze(1).unsqueeze(2)
+                positions[:batch_size]].unsqueeze(1).unsqueeze(2)
             # NOTE(woosuk): We should handle the case where the draft model
             # generates tokens beyond the max model length. Since it is complex
             # to remove such requests from the batch, we keep them in the batch
             # but adjust the position ids and slot mappings to avoid the
             # out-of-range access during the model execution. The draft tokens
             # generated with this adjustment should be ignored.
-            exceeds_max_model_len = positions >= self.runner.model_config.max_model_len
+            exceeds_max_model_len = positions[:batch_size] >= self.runner.model_config.max_model_len
             # Mask out the position ids that exceed the max model length.
             # Otherwise, we may get out-of-range error in RoPE.
             clamped_positions = torch.where(exceeds_max_model_len, 0,
-                                            positions)
+                                            positions[:batch_size])
             # Increment the sequence lengths.
             attn_metadata_i.seq_lens[:batch_size] += 1
             # For the requests that exceed the max model length, we set the
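
Because `positions` now carries the gathered (padded) length while only the first `batch_size` entries belong to live decode requests, the cos/sin cache lookups and the max-model-length check all slice with `[:batch_size]`. The clamping trick itself, in isolation (values are made up):

```python
import torch

max_model_len = 16
batch_size = 4

# A gathered positions buffer longer than the live batch; only the first
# batch_size entries are real.
positions = torch.tensor([3, 15, 16, 20, 0, 0, 0, 0])

exceeds_max_model_len = positions[:batch_size] >= max_model_len
# Clamp out-of-range positions to 0 so RoPE never indexes past its table;
# drafts produced for those requests are discarded downstream.
clamped_positions = torch.where(exceeds_max_model_len, 0, positions[:batch_size])
print(clamped_positions)   # tensor([ 3, 15,  0,  0])
```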