Skip to content

Commit 8436848

Browse files
authored
build trie in prefill and add hit rate (#4184)
1 parent 359c5a0 commit 8436848

File tree

5 files changed

+35
-1
lines changed

5 files changed

+35
-1
lines changed

lmdeploy/messages.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,7 @@ class ScheduleMetrics:
546546
active_blocks: int = 0
547547
cached_blocks: int = 0
548548
free_blocks: int = 0
549+
prefix_cache_hit_rate: float = 0
549550

550551

551552
@dataclass

lmdeploy/metrics/loggers.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,9 @@ def log(self):
118118
f'Unfinished: {scheduler_stats.num_total_reqs-scheduler_stats.num_finished_reqs} reqs, '
119119
f'Running: {scheduler_stats.num_running_reqs} reqs, '
120120
f'Waiting: {scheduler_stats.num_waiting_reqs} reqs, '
121-
f'GPU KV cache usage: {scheduler_stats.gpu_cache_usage * 100 :.1f}%')
121+
f'GPU KV cache usage: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, '
122+
f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%')
123+
122124
print(log_msg, flush=True)
123125
self.log_spec_msg()
124126

lmdeploy/metrics/stats.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ class SchedulerStats:
2020
num_running_reqs: Currently executing requests.
2121
num_waiting_reqs: Requests queued waiting for execution.
2222
gpu_cache_usage: Fraction of GPU KV blocks utilized (0.0 to 1.0).
23+
prefix_cache_hit_rate: Prefix caching hit rate.
2324
"""
2425

2526
num_total_reqs: int = 0
2627
num_finished_reqs: int = 0
2728
num_running_reqs: int = 0
2829
num_waiting_reqs: int = 0
2930
gpu_cache_usage: float = 0.0
31+
prefix_cache_hit_rate: float = 0.0
3032

3133
def __repr__(self):
3234
return ('SchedulerStats(\n'
@@ -35,12 +37,14 @@ def __repr__(self):
3537
f' num_running_reqs={self.num_running_reqs},\n'
3638
f' num_waiting_reqs={self.num_waiting_reqs},\n'
3739
f' gpu_cache_usage={self.gpu_cache_usage:.6f},\n'
40+
f' prefix_cache_hit_rate={self.prefix_cache_hit_rate:.6f},\n'
3841
')')
3942

4043
def update_from_schedule_metrics(self, scheduled_metrics: ScheduleMetrics):
4144
self.num_running_reqs = scheduled_metrics.active_seqs
4245
self.num_waiting_reqs = scheduled_metrics.waiting_seqs
4346
self.gpu_cache_usage = 1.0 - (scheduled_metrics.free_blocks / scheduled_metrics.total_blocks)
47+
self.prefix_cache_hit_rate = scheduled_metrics.prefix_cache_hit_rate
4448

4549

4650
class RequestStats:

lmdeploy/pytorch/paging/block_trie.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
import heapq
3+
from dataclasses import dataclass
34
from typing import Dict, Set
45

56
import numpy as np
@@ -10,6 +11,20 @@
1011
from .block_manager import BaseBlockManager
1112

1213

@dataclass
class PrefixCacheStats:
    """Accumulated prefix-cache statistics.

    Tracks how many prompt tokens were looked up against the block trie
    and how many of those lookups were served from cached blocks, so the
    scheduler can report a prefix-cache hit rate.
    """

    # Total tokens queried against the prefix cache.
    num_query_tokens: int = 0
    # Tokens that were satisfied by already-cached blocks.
    num_hit_tokens: int = 0

    def reset(self):
        """Zero both counters, starting a fresh measurement window."""
        self.num_query_tokens = 0
        self.num_hit_tokens = 0

    def hit_rate(self):
        """Return hits / queries as a float; 0.0 when nothing was queried yet."""
        if self.num_query_tokens <= 0:
            return 0.0
        return self.num_hit_tokens / self.num_query_tokens
27+
1328
class Node:
1429
"""Node of block trie."""
1530

@@ -54,6 +69,11 @@ def __init__(self, cache_config: CacheConfig, block_manager: BaseBlockManager):
5469
# caches with different adapter should not be shared.
5570
self._roots: Dict[str, Node] = dict()
5671
self.leaves: Set[Node] = set()
72+
self.stats = PrefixCacheStats()
73+
74+
def hit_rate(self):
75+
"""Get hit rate."""
76+
return self.stats.hit_rate()
5777

5878
def get_root(self, adapter_name: str):
5979
"""Get root by adapter name."""
@@ -73,6 +93,7 @@ def match(self, seq: SchedulerSequence):
7393
curr: Node = getattr(logical_blocks, 'last_shared_node', None)
7494
if curr is None:
7595
curr = self.get_root(seq.adapter_name)
96+
init_num_matched = curr.num_matched
7697
num_matched = curr.num_matched
7798

7899
def __match_success(node: Node):
@@ -101,6 +122,10 @@ def __match_success(node: Node):
101122
seq.logical_blocks.append(matched_blocks)
102123
seq.set_step(num_matched)
103124

125+
# record prefix hit
126+
self.stats.num_query_tokens += seq.num_all_ids - init_num_matched
127+
self.stats.num_hit_tokens += num_matched - init_num_matched
128+
104129
seq.logical_blocks.last_shared_node = curr
105130

106131
def allocate(self, seq: SchedulerSequence):

lmdeploy/pytorch/paging/scheduler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ def _reorder_waiting():
235235

236236
# allocate session memory
237237
self.block_manager.allocate(seq, prealloc_size)
238+
self.block_trie.allocate(seq)
238239
if self.is_ssm:
239240
self.state_manager.allocate(seq)
240241
_to_running(seq)
@@ -451,4 +452,5 @@ def schedule_metrics(self):
451452
waiting_seqs=self.num_waiting() + self.num_running(),
452453
total_blocks=self.block_manager.num_gpu_blocks,
453454
free_blocks=self.block_manager.get_num_free_gpu_blocks(),
455+
prefix_cache_hit_rate=self.block_trie.hit_rate(),
454456
)

0 commit comments

Comments (0)