
Commit df4b761

Improve turbomind's prefix cache (#3835)

* compile successfully
* trail whitespaces
* update
* fix linting
* fix linting
* fix linting
* put kInconsistency to DisableInvalidRequests
* update according to reviewer comments
* fix according to reviewer's comments
* interactive chat cannot be used when prefix caching is enabled
* fix
* fix according to reviewer comments
* fix linting

Parent: 8be72a5

File tree: 11 files changed (+291 lines, -143 lines)

lmdeploy/cli/chat.py
Lines changed: 3 additions & 0 deletions

@@ -14,6 +14,9 @@ def input_prompt():
 
 def build_pipe(model_path, backend, **kwargs):
     engine_config = None
+    if kwargs.get('enable_prefix_caching', False):
+        print('interactive chat cannot be used when prefix caching is enabled')
+        exit(-1)
     if backend == 'turbomind':
         engine_config = TurbomindEngineConfig()
         for key, value in kwargs.items():
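Prefix caching remains available through the batch `pipeline` API; only the interactive CLI chat now refuses it. A minimal usage sketch (the model path is a placeholder, not part of this commit):

```python
from lmdeploy import pipeline, TurbomindEngineConfig

# Prefix caching reuses KV blocks across requests that share a prompt prefix.
pipe = pipeline('internlm/internlm2_5-7b-chat',  # placeholder model
                backend_config=TurbomindEngineConfig(enable_prefix_caching=True))

# The second request can reuse the KV blocks cached for the shared prefix.
shared = 'You are a meticulous code reviewer. ' * 32
print(pipe([shared + 'Review this diff.', shared + 'Summarize this diff.']))
```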

lmdeploy/messages.py
Lines changed: 10 additions & 0 deletions

@@ -410,6 +410,7 @@ class ResponseType(enum.Enum):
     INPUT_LENGTH_ERROR = enum.auto()
     INTERNAL_ENGINE_ERROR = enum.auto()
     CANCEL = enum.auto()
+    PREFIX_CACHE_CONFLICT_INTERACTIVE_MODE = enum.auto()
 
 
 @dataclass
@@ -444,6 +445,15 @@ class Response:
     last_hidden_state: torch.Tensor = None
     index: int = 0
 
+    def __repr__(self):
+        logits = 'logits=None' if self.logits is None else f'logits.shape={self.logits.shape}\nlogits={self.logits}'
+        hidden_state = (
+            'last_hidden_state=None' if self.last_hidden_state is None else
+            f'last_hidden_state.shape={self.last_hidden_state.shape}\nlast_hidden_state={self.last_hidden_state}')
+        s = (f'text={self.text}\ngenerate_token_len={self.generate_token_len}\nfinish_reason="{self.finish_reason}"\n'
+             f'token_ids={self.token_ids}\nlog_probs={self.logprobs}\n{logits}\n{hidden_state}')
+        return s
+
 
 # modified from https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/__init__.py
 class EventType(enum.IntEnum):
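A quick sketch of what the new `__repr__` prints (the field values are invented, and the exact constructor signature may differ slightly from this sketch):

```python
from lmdeploy.messages import Response

# With logits and last_hidden_state unset, __repr__ prints 'logits=None' and
# 'last_hidden_state=None' instead of dumping tensors and their shapes.
resp = Response(text='Hello!', generate_token_len=2, input_token_len=5,
                finish_reason='stop', token_ids=[9906, 0], logprobs=None)
print(resp)
```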

lmdeploy/serve/async_engine.py
Lines changed: 12 additions & 7 deletions

@@ -690,11 +690,6 @@ async def generate(
         gen_config.stop_token_ids = self.stop_words
         gen_config.update_from_hf_gen_cfg(self.hf_gen_cfg, self.tokenizer.eos_token_id)
         if not gen_config.do_sample:
-            logger.warning(f'GenerationConfig: {gen_config}')
-            logger.warning('Since v0.6.0, lmdeploy add `do_sample` in '
-                           'GenerationConfig. It defaults to False, meaning greedy '
-                           'decoding. Please set `do_sample=True` if sampling '
-                           ' decoding is needed')
             # greedy decode
             gen_config.top_k = 1
             # avoid unnecessary process
@@ -704,8 +699,7 @@
         elif gen_config.random_seed is None and sequence_start:
             gen_config.random_seed = random.getrandbits(64)
         if gen_config.n > 1:
-            logger.ERROR(f"n({gen_config.n}) > 1 hasn't been supported yet. "
-                         f'Fallback to 1')
+            logger.warning(f'n({gen_config.n}) > 1 hasn\'t been supported yet. Fallback to 1')
             gen_config.n = 1
         if messages:
             prompt = messages
@@ -742,6 +736,17 @@
         if sequence_end is True and sequence_start is False:
             await self.end_session(session_id)
             return
+        if self.backend_config.enable_prefix_caching and (gen_config.output_last_hidden_state == 'all'
+                                                          or gen_config.output_logits == 'all'):
+            errmsg = ('lmdeploy does not support outputting all token\'s logits or last_hidden_state '
+                      'when prefix caching is ON')
+            yield GenOut(response=errmsg,
+                         history_token_len=self.id2step[session_id],
+                         input_token_len=len(input_ids),
+                         generate_token_len=0,
+                         finish_reason='error',
+                         token_ids=[])
+            return
 
         def is_error(status):
             return status not in [ResponseType.SUCCESS, ResponseType.FINISH, ResponseType.CANCEL]
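The new guard exists because blocks matched from the prefix cache skip the prefill forward pass, so per-token logits and hidden states for those positions are never computed; returning partial tensors would be silently wrong. A sketch of how the error now surfaces (the model path is a placeholder):

```python
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

pipe = pipeline('internlm/internlm2_5-7b-chat',  # placeholder model
                backend_config=TurbomindEngineConfig(enable_prefix_caching=True))

# Requesting every token's logits cannot be satisfied with prefix caching on.
out = pipe('hello', gen_config=GenerationConfig(output_logits='all'))
print(out.finish_reason)  # expected: 'error'
```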

lmdeploy/turbomind/turbomind.py
Lines changed: 1 addition & 0 deletions

@@ -538,6 +538,7 @@ def __init__(self, tm_model: TurboMind, config: TurbomindModelConfig, cuda_strea
             6: ResponseType.INPUT_LENGTH_ERROR,
             7: ResponseType.FINISH,
             8: ResponseType.CANCEL,
+            9: ResponseType.PREFIX_CACHE_CONFLICT_INTERACTIVE_MODE,
             -1: ResponseType.INTERNAL_ENGINE_ERROR,
         }
 

src/turbomind/engine/request.h
Lines changed: 10 additions & 9 deletions

@@ -140,15 +140,16 @@ struct Request {
 
     enum
     {
-        kOk       = 0,
-        kInvalid  = 1,  // Sequence not exist or both `start` & `stop` (instead of `end`) is set
-        kConflict = 2,  // Concurrent requests to the same sequence
-        kBusy     = 3,  // Sequence is already running
-        kInactive = 4,  // Sequence to `stop` is not active
-        kFail     = 5,  // Can't find sequence for `stop` request or internal error during inference
-        kTooLong  = 6,  // history + prompt > session_len,
-        kFinish   = 7,
-        kCancel   = 8,
+        kOk            = 0,
+        kInvalid       = 1,  // Sequence does not exist, or both `start` & `stop` (instead of `end`) are set
+        kConflict      = 2,  // Concurrent requests to the same sequence
+        kBusy          = 3,  // Sequence is already running
+        kInactive      = 4,  // Sequence to `stop` is not active
+        kFail          = 5,  // Can't find sequence for `stop` request, or internal error during inference
+        kTooLong       = 6,  // history + prompt > session_len
+        kFinish        = 7,
+        kCancel        = 8,
+        kInconsistency = 9,  // Inconsistent request parameters, e.g. prefix caching is not allowed in interactive mode
     };
 };
 

src/turbomind/models/llama/BlockTrie.cc
Lines changed: 37 additions & 36 deletions

@@ -14,20 +14,23 @@ size_t hash(const std::vector<int>& vec)
     return seed;
 }
 
-BlockTrie::BlockTrie(size_t block_seq_len, std::shared_ptr<BlockManager> block_manager, bool enable_prefix_caching):
-    block_seq_len_(block_seq_len), block_manager_(block_manager), enable_prefix_caching_(enable_prefix_caching)
+BlockTrie::BlockTrie(size_t block_len, std::shared_ptr<BlockManager> block_manager):
+    block_seq_len_(block_len), block_manager_(block_manager)
 {
     root_ = std::make_shared<TrieNode>();
 }
 
-void BlockTrie::match(Sequence& seq)
+std::tuple<BlockIds, UniqueIds> BlockTrie::Match(const Sequence& seq)
 {
     BlockIds  matched_blocks;
     UniqueIds matched_unique_ids;
 
     std::shared_ptr<TrieNode> curr_node   = root_;
     int                       num_matched = 0;
 
+    // Warning: Do not use the "<=" operator, even when seq.prompt's length is evenly
+    // divisible by block_seq_len_. That may produce an input_length of zero for
+    // the sequence, violating the precondition checked in LlamaBatch::Forward.
     while (num_matched + block_seq_len_ < seq.prompt.size()) {
         std::vector<int> curr_tokens(seq.prompt.begin() + num_matched,
                                      seq.prompt.begin() + num_matched + block_seq_len_);
@@ -40,44 +43,47 @@ void BlockTrie::match(Sequence& seq)
         }
 
         if (curr_tokens != it->second->tokens) {
+            TM_LOG_WARNING("hash key cache hit, but tokens are not the same");
             break;
         }
 
-        matched_blocks.push_back(it->second->block_id);
-        matched_unique_ids.push_back(it->second->block_unique_id);
+        matched_blocks.emplace_back(it->second->block_id);
+        matched_unique_ids.emplace_back(it->second->block_unique_id);
         curr_node = it->second;
         num_matched += block_seq_len_;
     }
-
-    if (matched_blocks.size() > 0) {
-        // add use count
-        block_manager_->Lock(matched_blocks);
-        block_manager_->Touch(matched_blocks);
-        // only consider no history blocks
-        seq.blocks.insert(seq.blocks.end(), matched_blocks.begin(), matched_blocks.end());
-        seq.block_unique_ids.insert(seq.block_unique_ids.end(), matched_unique_ids.begin(), matched_unique_ids.end());
-    }
+    return std::make_tuple(matched_blocks, matched_unique_ids);
 }
 
-void BlockTrie::cache(const Sequence& seq)
+std::tuple<BlockIds, UniqueIds> BlockTrie::Cache(const Sequence& seq, const std::vector<int>& tokens)
 {
-    std::shared_ptr<TrieNode> curr_node = root_;
-    int num_matched = 0;
-    int idx = 0;
-    BlockIds cached_blocks;
+    FT_CHECK(seq.status != Sequence::kCached);
+    FT_CHECK(tokens.size() <= seq.blocks.size() * block_seq_len_);
 
-    while (num_matched + block_seq_len_ <= seq.prompt.size()) {
-        std::vector<int> curr_tokens(seq.prompt.begin() + num_matched,
-                                     seq.prompt.begin() + num_matched + block_seq_len_);
-        size_t hash_key = hash(curr_tokens);
+    std::shared_ptr<TrieNode> curr_node = root_;
+    int                       idx       = 0;
 
-        auto it = curr_node->children.find(hash_key);
+    BlockIds  cache_block_ids;
+    UniqueIds cache_block_unique_ids;
+
+    // We don't cache the last block of the sequence, since it might not be full
+    // TODO(lvhan): determine whether the last block is full or not. It is not trivial
+    // considering chunked prefill
+    for (int idx = 0; idx < (int)seq.blocks.size() - 1; ++idx) {
+        auto start = tokens.begin() + idx * block_seq_len_;
+        auto end   = start + block_seq_len_;
+
+        std::vector<int> curr_tokens(start, end);
+        // TODO(lvhan): add salt to ensure the hash security
+        size_t hash_key = hash(curr_tokens);
 
         int      block_id        = seq.blocks[idx];
         uint64_t block_unique_id = seq.block_unique_ids[idx];
 
+        auto it = curr_node->children.find(hash_key);
         if (it != curr_node->children.end()) {
             if (curr_tokens != it->second->tokens) {
+                TM_LOG_WARNING("[BlockTrie][cache] hash key cache hit, but tokens are not the same");
                 break;
             }
             curr_node = it->second;
@@ -91,38 +97,33 @@ void BlockTrie::cache(const Sequence& seq)
             node->tokens          = curr_tokens;
             node->block_id        = block_id;
             node->block_unique_id = block_unique_id;
-            node->num_matched     = num_matched + block_seq_len_;
             curr_node->children[hash_key] = node;
             curr_node = node;
         }
-
-        cached_blocks.push_back(curr_node->block_id);
-        num_matched += block_seq_len_;
-        idx++;
+        cache_block_ids.emplace_back(block_id);
+        cache_block_unique_ids.emplace_back(block_unique_id);
     }
 
-    block_manager_->Touch(cached_blocks);
+    return std::make_tuple(cache_block_ids, cache_block_unique_ids);
 }
 
-int BlockTrie::verify()
+void BlockTrie::Verify()
 {
-    return verify_traverse(root_);
+    DFS(root_);
 }
 
-int BlockTrie::verify_traverse(std::shared_ptr<TrieNode>& node)
+void BlockTrie::DFS(std::shared_ptr<TrieNode>& node)
 {
-    int valid_count = 1;
     for (auto it = node->children.begin(); it != node->children.end();) {
         if (block_manager_->unique_id(it->second->block_id) != it->second->block_unique_id) {
             // child invalid
            it = node->children.erase(it);
         }
         else {
-            valid_count += verify_traverse(it->second);
+            DFS(it->second);
            it++;
         }
     }
-    return valid_count;
 }
 
 }  // namespace turbomind
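The matching logic is easier to see outside C++. Below is a self-contained Python sketch (illustrative only, not turbomind code) of the block-granular lookup in `BlockTrie::Match`, including the strict `<` bound: with a block length of 4 and an 8-token prompt, only the first block matches, leaving 4 tokens as prefill input; a `<=` bound would match both blocks and leave the sequence with zero input tokens, violating the precondition checked in `LlamaBatch::Forward`.

```python
block_seq_len = 4

def match(trie, prompt):
    """Walk the trie one full block at a time; return matched block ids."""
    node, matched, num_matched = trie, [], 0
    # Strict '<' mirrors the C++ loop: the final full block stays unmatched
    # so the sequence always keeps a non-empty input for prefill.
    while num_matched + block_seq_len < len(prompt):
        block = tuple(prompt[num_matched:num_matched + block_seq_len])
        child = node['children'].get(hash(block))
        # Compare the raw tokens too: a hash collision must not fake a match.
        if child is None or child['tokens'] != block:
            break
        matched.append(child['block_id'])
        node = child
        num_matched += block_seq_len
    return matched

trie = {'children': {}}
b0 = (1, 2, 3, 4)
trie['children'][hash(b0)] = {'children': {}, 'tokens': b0, 'block_id': 0}

# Two full blocks in the prompt, but only block 0 is matched: tokens 5..8
# are reserved as prefill input by the strict '<' bound.
print(match(trie, [1, 2, 3, 4, 5, 6, 7, 8]))  # -> [0]
```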

src/turbomind/models/llama/BlockTrie.h
Lines changed: 35 additions & 14 deletions

@@ -22,27 +22,48 @@ struct TrieNode {
 
 class BlockTrie {
 public:
-    explicit BlockTrie(size_t block_len_, std::shared_ptr<BlockManager> block_manager, bool enable_prefix_caching);
+    explicit BlockTrie(size_t block_len, std::shared_ptr<BlockManager> block_manager);
 
-    bool enabled()
-    {
-        return enable_prefix_caching_;
-    }
+    /**
+     * @brief Attempt to match cached key-value (KV) blocks for a given sequence.
+     *
+     * This function iterates over the sequence's tokens and attempts to match
+     * them against the cached KV blocks. When the longest prefix match is found,
+     * it returns the IDs and unique IDs of the matched blocks.
+     *
+     * @param seq The sequence whose tokens are to be matched against the cached KV blocks.
+     * @return A tuple containing the following:
+     *         - BlockIds: a list of IDs of the matched blocks.
+     *         - UniqueIds: a list of unique IDs of the matched blocks.
+     *
+     * @note If no blocks are matched, all containers in the returned tuple will be empty.
+     */
+    std::tuple<BlockIds, UniqueIds> Match(const Sequence& seq);
 
-    // get cached blocks for sequence
-    void match(Sequence& seq);
+    /**
+     * @brief Cache the key-value (KV) blocks of a given sequence.
+     *
+     * This function caches the KV blocks of the specified sequence. Only valid
+     * blocks of a sequence whose status is NOT `Sequence::kCached` are
+     * considered for caching.
+     *
+     * @param seq The sequence whose KV blocks are to be cached.
+     * @param tokens The token list corresponding to the KV blocks.
+     * @return A tuple containing the following:
+     *         - BlockIds: a list of IDs of the cached blocks.
+     *         - UniqueIds: a list of unique IDs of the cached blocks.
+     */
+    std::tuple<BlockIds, UniqueIds> Cache(const Sequence& seq, const std::vector<int>& tokens);
 
-    // cache computed blocks for sequence
-    void cache(const Sequence& seq);
-
-    // remove invalid nodes, return valid count
-    int verify();
+    /**
+     * @brief Remove invalid nodes from the trie.
+     */
+    void Verify();
 
 private:
-    int verify_traverse(std::shared_ptr<TrieNode>& node);
+    void DFS(std::shared_ptr<TrieNode>& node);
 
 private:
-    bool enable_prefix_caching_;
     size_t block_seq_len_;
 
     std::shared_ptr<BlockManager> block_manager_;
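Since `Match` and `Cache` no longer lock or touch blocks, nor extend the sequence's block list, that bookkeeping moves to the caller. A Python sketch of the new division of labor and of the pruning `Verify` performs (names such as `unique_id_of_block` are illustrative stand-ins for `BlockManager::unique_id`):

```python
def on_new_request(trie, block_manager, seq):
    # Match is now a pure query; the caller takes the use count and extends
    # the sequence's block list itself.
    blocks, unique_ids = trie.match(seq)
    if blocks:
        block_manager.lock(blocks)
        block_manager.touch(blocks)
        seq.blocks.extend(blocks)
        seq.block_unique_ids.extend(unique_ids)

def verify(node, unique_id_of_block):
    # Prune children whose block was evicted and reassigned, i.e. whose
    # recorded unique id no longer matches the block manager's; recurse
    # into the survivors, like BlockTrie::DFS.
    for key in list(node['children']):
        child = node['children'][key]
        if unique_id_of_block(child['block_id']) != child['unique_id']:
            del node['children'][key]
        else:
            verify(child, unique_id_of_block)
```

Passing `tokens` into `Cache` explicitly, instead of reading `seq.prompt`, also decouples caching from the prompt, so callers can, in principle, cache blocks filled during generation through the same code path.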
