[P/D]kv_output_aggregator support P TP > D TP (vllm-project#23917)

LCAIZJ · leichao.lc · FeiDaLI · commit fafc80420a89 · 2025-09-25T18:54:17.000+08:00
Signed-off-by: LCAIZJ <leichao139636@163.com>
Co-authored-by: leichao.lc <leichao.lc@antgroup.com>
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -355,3 +355,14 @@ def get_required_kvcache_layout(
             raise TypeError("get_required_kvcache_layout should not be called "
                             "on the abstract base class")
         return None
+
+    def get_finished_count(self) -> Optional[int]:
+        """
+        Get the count of requests expected to complete send/receive operations
+        via this connector.
+
+        Returns:
+            Optional[int]: expected completion count; None in this base class.
+        """
+
+        return None
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
@@ -13,6 +13,7 @@
 
 import vllm.platforms
 from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
@@ -54,6 +55,7 @@ def __init__(
         self._init_executor()
         self.is_sleeping = False
         self.sleeping_tags: set[str] = set()
+        self.kv_output_aggregator = None
 
     @abstractmethod
     def _init_executor(self) -> None:
@@ -252,6 +254,11 @@ async def check_health_async(self) -> None:
         exception."""
         self.check_health()
 
+    def init_kv_output_aggregator(self, finished_count: Optional[int]) -> None:
+        """Init KVOutputAggregator; a falsy count falls back to world_size."""
+        self.kv_output_aggregator = KVOutputAggregator(
+            finished_count or self.parallel_config.world_size)
+
 
 class DistributedExecutorBase(ExecutorBase):
     """Abstract superclass of distributed executor implementations."""
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -128,6 +128,9 @@ def __init__(self,
             log_stats=self.log_stats,
         )
         self.use_spec_decode = vllm_config.speculative_config is not None
+        if self.scheduler.connector is not None:  # type: ignore
+            self.model_executor.init_kv_output_aggregator(
+                self.scheduler.connector.get_finished_count())  # type: ignore
 
         self.mm_registry = mm_registry = MULTIMODAL_REGISTRY
         self.mm_receiver_cache = engine_receiver_cache_from_config(
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
@@ -26,7 +26,6 @@
                               destroy_model_parallel)
 from vllm.distributed.device_communicators.shm_broadcast import (Handle,
                                                                  MessageQueue)
-from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
                                              get_pp_group, get_tp_group)
 from vllm.executor.multiproc_worker_utils import (
@@ -135,8 +134,6 @@ def _init_executor(self) -> None:
 
         self.output_rank = self._get_output_rank()
         self.has_connector = self.vllm_config.kv_transfer_config is not None
-        self.kv_output_aggregator = KVOutputAggregator(
-            self.parallel_config.world_size)
 
     def start_worker_monitor(self):
         workers = self.workers
diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py
@@ -51,8 +51,6 @@ def _init_executor(self) -> None:
 
         # KV connector setup
         self.has_connector = self.vllm_config.kv_transfer_config is not None
-        self.kv_output_aggregator = KVOutputAggregator(
-            self.parallel_config.world_size)
 
     @property
     def max_concurrent_batches(self) -> int:

Original file line number	Diff line number	Diff line change
`@@ -128,6 +128,9 @@ def __init__(self,`
`128`	`128`	`log_stats=self.log_stats,`
`129`	`129`	`)`
`130`	`130`	`self.use_spec_decode = vllm_config.speculative_config is not None`
	`131`	`+ if self.scheduler.connector is not None: # type: ignore`
	`132`	`+ self.model_executor.init_kv_output_aggregator(`
	`133`	`+ self.scheduler.connector.get_finished_count()) # type: ignore`
`131`	`134`
`132`	`135`	`self.mm_registry = mm_registry = MULTIMODAL_REGISTRY`
`133`	`136`	`self.mm_receiver_cache = engine_receiver_cache_from_config(`