add a new API to KV cache coordinator

KuntaiDu · KuntaiDu · commit f2085d90c30a · 2025-09-03T00:03:30.000Z
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
@@ -73,6 +73,45 @@ def get_num_blocks_to_allocate(self, request_id: str, num_tokens: int,
                     request_id, num_tokens, new_computed_blocks[i])
         return num_blocks_to_allocate
 
+    def get_num_blocks_to_allocate_for_connector(
+            self, request_id: str, num_tokens: int,
+            num_connector_prefix_tokens: int,
+            new_computed_blocks: tuple[list[KVCacheBlock],
+                                       ...], num_encoder_tokens: int) -> int:
+        """
+        Get the # of blocks to allocate for request when using connector.
+
+        Args:
+            request_id: The request ID.
+            num_tokens: The total number of tokens that need a slot (including 
+                tokens that are already allocated).
+            num_connector_prefix_tokens: The number of tokens that hits 
+                the prefix cache inside connector.
+            new_computed_blocks: The new computed blocks just hitting the
+                prefix caching.
+            num_encoder_tokens: The number of encoder tokens for allocating
+                blocks for cross-attention.
+
+        Returns:
+            The number of blocks.
+        """
+        num_blocks_to_allocate = 0
+        for i, manager in enumerate(self.single_type_managers):
+            if isinstance(manager, CrossAttentionManager):
+                # Cross-attention does not support prefix cache
+                # from connector yet.
+                num_blocks_to_allocate += \
+                    manager.get_num_blocks_to_allocate_for_connector(
+                    request_id, num_encoder_tokens, 0, [])
+            else:
+                num_blocks_to_allocate += \
+                    manager.get_num_blocks_to_allocate_for_connector(
+                    request_id,
+                    num_tokens,
+                    num_connector_prefix_tokens,
+                    new_computed_blocks[i])
+        return num_blocks_to_allocate
+
     def save_new_computed_blocks(
             self, request_id: str,
             new_computed_blocks: tuple[list[KVCacheBlock], ...]) -> None: