
Commit 51cbd2c

reserve blocks for dummy inputs (#4157)
* reserve blocks for dummy inputs
* fix sliding window
* limit session len
* remove comment
1 parent: 4416bd3

8 files changed: 49 additions & 49 deletions

lmdeploy/pytorch/config.py

Lines changed: 3 additions & 0 deletions
@@ -90,6 +90,9 @@ class CacheConfig:
     num_state_caches: int = None
     states_shapes: List[Tuple] = field(default_factory=list)
 
+    # reserved blocks for dummy inputs, init to 0 for unit test.
+    num_reserved_gpu_blocks: int = 0
+
     # For PD Disaggregation
     role: EngineRole = EngineRole.Hybrid
     migration_backend: MigrationBackend = MigrationBackend.DLSlime
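
Note: the field defaults to 0, so unit tests that build a CacheConfig directly see no behaviour change; only the engine opts in. A dependency-free sketch of that contract (the stand-in dataclass below is illustrative, not the real CacheConfig):

from dataclasses import dataclass


# Stripped-down stand-in for CacheConfig, only to show the new field's default.
# The real class lives in lmdeploy/pytorch/config.py and has many more fields.
@dataclass
class CacheConfigSketch:
    num_gpu_blocks: int
    num_reserved_gpu_blocks: int = 0  # default 0 keeps unit tests unchanged


unit_test_cfg = CacheConfigSketch(num_gpu_blocks=16)  # reserves nothing
engine_cfg = CacheConfigSketch(num_gpu_blocks=16, num_reserved_gpu_blocks=1)

assert unit_test_cfg.num_reserved_gpu_blocks == 0
assert engine_cfg.num_gpu_blocks - engine_cfg.num_reserved_gpu_blocks == 15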

lmdeploy/pytorch/engine/engine.py

Lines changed: 16 additions & 12 deletions
@@ -113,17 +113,20 @@ def _build_scheduler_config(engine_config: PytorchEngineConfig):
 
 def _build_cache_config(engine_config: PytorchEngineConfig):
     """Build cache config."""
-    cache_config = CacheConfig(max_batches=engine_config.max_batch_size,
-                               block_size=engine_config.block_size,
-                               num_cpu_blocks=engine_config.num_cpu_blocks,
-                               num_gpu_blocks=engine_config.num_gpu_blocks,
-                               cache_max_entry_count=engine_config.cache_max_entry_count,
-                               max_prefill_token_num=engine_config.max_prefill_token_num,
-                               enable_prefix_caching=engine_config.enable_prefix_caching,
-                               quant_policy=engine_config.quant_policy,
-                               device_type=engine_config.device_type,
-                               migration_backend=engine_config.migration_backend,
-                               role=engine_config.role)
+    cache_config = CacheConfig(
+        max_batches=engine_config.max_batch_size,
+        block_size=engine_config.block_size,
+        num_cpu_blocks=engine_config.num_cpu_blocks,
+        num_gpu_blocks=engine_config.num_gpu_blocks,
+        cache_max_entry_count=engine_config.cache_max_entry_count,
+        max_prefill_token_num=engine_config.max_prefill_token_num,
+        enable_prefix_caching=engine_config.enable_prefix_caching,
+        quant_policy=engine_config.quant_policy,
+        device_type=engine_config.device_type,
+        migration_backend=engine_config.migration_backend,
+        role=engine_config.role,
+        # reserve 1 blocks for dummy input and padding
+        num_reserved_gpu_blocks=1)
     return cache_config
 
 
@@ -542,7 +545,8 @@ def _response(self, resp: Response, resp_type: ResponseType, data: Any = None, e
     def _get_max_session_len(self):
         """Get max session len."""
         session_len = self.scheduler_config.max_session_len
-        max_tokens = (self.cache_config.num_gpu_blocks * self.cache_config.block_size)
+        num_gpu_blocks = self.cache_config.num_gpu_blocks - self.cache_config.num_reserved_gpu_blocks
+        max_tokens = (num_gpu_blocks * self.cache_config.block_size)
         window_size = self.cache_config.window_size
         if window_size > 0 and window_size <= max_tokens:
             max_tokens = (1 << 63) - 1
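
With the reservation in place, the session-length cap is derived from the usable blocks rather than the raw pool size. A quick worked example of the new arithmetic (the block count and block size are made-up numbers, not repo defaults):

# Illustrative numbers only.
num_gpu_blocks = 1024
num_reserved_gpu_blocks = 1   # the engine reserves one block for dummy inputs/padding
block_size = 64

# Before this commit the cap was 1024 * 64 = 65536 tokens.
max_tokens = (num_gpu_blocks - num_reserved_gpu_blocks) * block_size
print(max_tokens)  # 65472 -- one block's worth of tokens is held back

# As in the hunk above: a sliding window that fits inside the budget lifts the cap.
window_size = 4096
if 0 < window_size <= max_tokens:
    max_tokens = (1 << 63) - 1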

lmdeploy/pytorch/model_inputs.py

Lines changed: 0 additions & 2 deletions
@@ -406,8 +406,6 @@ def new(
         # seq_len + history_length
         kv_seqlens = q_seqlens + history_seqlens
         kv_seqlens -= inputs.num_ignored_history
-        if inputs.is_dummy:
-            kv_seqlens = torch.zeros_like(kv_seqlens)
 
         ret = StepContext(
             input_ids=inputs.input_ids,

lmdeploy/pytorch/paging/block_manager/__init__.py

Lines changed: 6 additions & 2 deletions
@@ -15,8 +15,12 @@ def build_block_manager(cache_config: CacheConfig) -> BaseBlockManager:
     num_cpu_blocks = cache_config.num_cpu_blocks
     num_gpu_blocks = cache_config.num_gpu_blocks
     window_size = cache_config.window_size
+    num_gpu_reserved = cache_config.num_reserved_gpu_blocks
 
     if window_size < 0:
-        return DefaultBlockManager(num_gpu_blocks, num_cpu_blocks)
+        return DefaultBlockManager(num_gpu_blocks, num_cpu_blocks, num_gpu_reserved=num_gpu_reserved)
     else:
-        return WindowBlockManager(num_gpu_blocks, num_cpu_blocks, window_size=window_size)
+        return WindowBlockManager(num_gpu_blocks,
+                                  num_cpu_blocks,
+                                  window_size=window_size,
+                                  num_gpu_reserved=num_gpu_reserved)
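
The factory only threads the new reservation through to whichever manager it picks. A condensed, dependency-free sketch of that dispatch (the tuples stand in for DefaultBlockManager / WindowBlockManager instances, and the sizes are hypothetical):

# Condensed sketch of the dispatch performed by build_block_manager.
def build_block_manager_sketch(num_gpu_blocks, num_cpu_blocks, window_size, num_gpu_reserved):
    if window_size < 0:
        return ('DefaultBlockManager', num_gpu_blocks, num_cpu_blocks, num_gpu_reserved)
    return ('WindowBlockManager', num_gpu_blocks, num_cpu_blocks, window_size, num_gpu_reserved)


print(build_block_manager_sketch(1024, 512, -1, 1))    # no sliding window
print(build_block_manager_sketch(1024, 512, 4096, 1))  # sliding-window model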

lmdeploy/pytorch/paging/block_manager/base_block_manager.py

Lines changed: 7 additions & 25 deletions
@@ -28,31 +28,13 @@ def num_blocks(self):
         return self._num_blocks
 
 
-class PhysicalMemory:
-    """Physical memory blocks."""
-
-    def __init__(self, num_cpu_blocks: int, num_gpu_blocks: int) -> None:
-        self._num_cpu_blocks = num_cpu_blocks
-        self._num_gpu_blocks = num_gpu_blocks
-        self._num_blocks = num_cpu_blocks + num_gpu_blocks
-
-    def num_cpu_blocks(self):
-        """Get num cpu blocks."""
-        return self._num_cpu_blocks
-
-    def num_gpu_blocks(self):
-        """Get num gpu blocks."""
-        return self._num_gpu_blocks
-
-
 class PhysicalAllocator:
     """The physical block allocator.
 
     The allocator won't allocate real memory. It is used to support block manager.
     """
 
-    def __init__(self, memory: PhysicalMemory, num_blocks: int, offset: int = 0):
-        self._mem = memory
+    def __init__(self, num_blocks: int, offset: int = 0):
         self._num_blocks = num_blocks
         self._offset = offset
 
@@ -87,13 +69,13 @@ def get_num_free_blocks(self):
 class LogicalAllocator:
     """The logical block allocator."""
 
-    def __init__(self, num_cpu_blocks: int, num_gpu_blocks: int) -> None:
+    def __init__(self, num_cpu_blocks: int, num_gpu_blocks: int, num_gpu_reserved: int = 0) -> None:
         self._log_mem = LogicalMemory(num_cpu_blocks + num_gpu_blocks)
-        self._phy_mem = PhysicalMemory(num_cpu_blocks, num_gpu_blocks)
 
         self._cpu_mem_offset = num_gpu_blocks
-        self._gpu_allocator = PhysicalAllocator(self._phy_mem, num_gpu_blocks, 0)
-        self._cpu_allocator = PhysicalAllocator(self._phy_mem, num_cpu_blocks, self._cpu_mem_offset)
+        num_gpu_blocks -= num_gpu_reserved
+        self._gpu_allocator = PhysicalAllocator(num_gpu_blocks, num_gpu_reserved)
+        self._cpu_allocator = PhysicalAllocator(num_cpu_blocks, self._cpu_mem_offset)
 
         num_blocks = self._log_mem.num_blocks()
         self._num_blocks = num_blocks
 
@@ -225,11 +207,11 @@ class BaseBlockManager:
         num_cpu_blocks (int): number of cpu blocks.
     """
 
-    def __init__(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
+    def __init__(self, num_gpu_blocks: int, num_cpu_blocks: int, num_gpu_reserved: int = 0) -> None:
         self.num_gpu_blocks = num_gpu_blocks
         self.num_cpu_blocks = num_cpu_blocks
 
-        self.allocator = LogicalAllocator(num_cpu_blocks, num_gpu_blocks)
+        self.allocator = LogicalAllocator(num_cpu_blocks, num_gpu_blocks, num_gpu_reserved)
 
         self.block_tables: Dict[int, BlockTable] = {}
 
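
The reservation itself is just an offset: the GPU allocator's free pool now starts at num_gpu_reserved, so physical block ids below that value are never handed out to regular requests and remain available for dummy/padding inputs, while the CPU allocator keeps its old offset of num_gpu_blocks. A simplified, self-contained allocator illustrating the idea (not the real PhysicalAllocator):

import numpy as np


class TinyPhysicalAllocator:
    """Simplified free-list allocator mirroring the offset-based reservation."""

    def __init__(self, num_blocks: int, offset: int = 0):
        # Block ids start at `offset`; ids below it never enter the free pool.
        self._free_blocks = np.arange(offset, offset + num_blocks, dtype=np.int64)
        self._free_count = num_blocks

    def allocate(self, num_blocks: int) -> np.ndarray:
        assert num_blocks <= self._free_count, 'out of free blocks'
        blocks = self._free_blocks[self._free_count - num_blocks:self._free_count]
        self._free_count -= num_blocks
        return blocks.copy()


num_gpu_blocks, num_gpu_reserved = 8, 1
gpu_alloc = TinyPhysicalAllocator(num_gpu_blocks - num_gpu_reserved, offset=num_gpu_reserved)
print(gpu_alloc.allocate(3))  # [5 6 7]; block id 0 is never returned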

lmdeploy/pytorch/paging/block_manager/window_block_manager.py

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@ class WindowBlockManager(DefaultBlockManager):
         num_cpu_blocks (int): number of cpu blocks.
     """
 
-    def __init__(self, num_gpu_blocks: int, num_cpu_blocks: int, window_size: int):
-        super().__init__(num_gpu_blocks, num_cpu_blocks)
+    def __init__(self, num_gpu_blocks: int, num_cpu_blocks: int, window_size: int, num_gpu_reserved: int = 0):
+        super().__init__(num_gpu_blocks, num_cpu_blocks, num_gpu_reserved)
         assert window_size > 0, ('expect window size > 0, '
                                  f'but get window_size = {window_size}')
         self.window_size = window_size

lmdeploy/pytorch/paging/scheduler.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 from ..messages import MessageStatus, SchedulerSequence, SchedulerSession, SequenceManager, SequenceMeta
 from .block_manager import build_block_manager
 from .block_trie import BlockTrie
-from .state_manager import StateManager
+from .state_manager import build_state_manager
 
 logger = get_logger('lmdeploy')
 
@@ -52,7 +52,7 @@ def __init__(
 
         self.block_manager = build_block_manager(cache_config)
         self.block_trie = BlockTrie(self.cache_config, self.block_manager)
-        self.state_manager = StateManager(self.cache_config.num_state_caches)
+        self.state_manager = build_state_manager(self.cache_config)
         self.is_ssm = len(self.cache_config.states_shapes) > 0
 
         self.eviction_helper = self.build_eviction_helper(self.scheduler_config.eviction_type)

lmdeploy/pytorch/paging/state_manager.py

Lines changed: 13 additions & 4 deletions
@@ -1,15 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import numpy as np
 
+from lmdeploy.pytorch.config import CacheConfig
 from lmdeploy.pytorch.messages import SchedulerSequence
 
 
 class StateAllocator:
     """State allocator."""
 
-    def __init__(self, num_states: int):
+    def __init__(self, num_states: int, offset: int = 0):
         self.num_states = num_states
-        self._free_states = np.arange(num_states, dtype=np.int64)
+        self._free_states = np.arange(offset, offset + num_states, dtype=np.int64)
         self._free_count = num_states
 
     def allocate(self):
@@ -33,10 +34,10 @@ def get_num_free(self):
 
 class StateManager:
 
-    def __init__(self, num_states: int):
+    def __init__(self, num_states: int, num_reserved: int = 0):
         if num_states is None:
             num_states = 1
-        self.allocator = StateAllocator(num_states)
+        self.allocator = StateAllocator(num_states, offset=num_reserved)
 
     def is_allocated(self, seq: SchedulerSequence):
         """Check if a sequence is allocated."""
@@ -58,3 +59,11 @@ def free(self, seq: SchedulerSequence):
     def get_num_free(self):
         """Get num free."""
         return self.allocator.get_num_free()
+
+
+def build_state_manager(cache_config: CacheConfig) -> StateManager:
+    """Build state manager."""
+    num_states = cache_config.num_state_caches
+    # state is different from block, we always reserve one state for system use
+    num_reserved = 1
+    return StateManager(num_states, num_reserved)
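
The state reservation follows the same offset pattern as the blocks: the free list starts at the reserved offset, so state index 0 is never handed to a request (the slot the code comment above reserves for system use). A quick check of the arange arithmetic with illustrative numbers:

import numpy as np

num_states, num_reserved = 4, 1
free_states = np.arange(num_reserved, num_reserved + num_states, dtype=np.int64)
print(free_states)  # [1 2 3 4]; index 0 stays reserved
assert 0 not in free_states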
