
Commit 66030ef

[TRTLLM-6452][feat]: Two-model engine KV cache reuse support (#6133)
Signed-off-by: ziyixiong-nv <[email protected]>
1 parent 82d3587 commit 66030ef

File tree: 6 files changed (+89, -22 lines)

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 4 additions & 1 deletion
@@ -826,6 +826,7 @@ class GenericLlmRequest
         mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT
                                                                      : LlmRequestState::kCONTEXT_INIT;
         mContextCurrentPosition = 0;
+        mPrepopulatedPromptLen = 0;
         mContextChunkSize = mPromptLen;
         mSeqSlot.reset();
     }
@@ -1564,7 +1565,9 @@ class GenericLlmRequest
     /// Returns whether the position is at the beginning of the context.
     [[nodiscard]] bool isFirstContextChunk() const noexcept
     {
-        return mContextCurrentPosition == 0;
+        // The number of cached tokens is already counted in mContextCurrentPosition,
+        // so the start position of the context is mPrepopulatedPromptLen.
+        return mContextCurrentPosition == mPrepopulatedPromptLen;
     }

     /// Move the cursor forward one chunk. When not chunked, move forward to the end of the context.
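
For illustration only, here is a minimal Python sketch of the bookkeeping this change relies on. The class and attribute names below mirror GenericLlmRequest's members but are hypothetical, not part of any API: tokens restored from reused KV cache blocks are already counted in the context position, so the first-chunk check must compare against the prepopulated prompt length rather than zero.

# Minimal sketch, assuming bookkeeping analogous to GenericLlmRequest; not the real class.
class RequestSketch:

    def __init__(self, prompt_len: int, prepopulated_prompt_len: int = 0):
        # Tokens restored from reused KV cache blocks count toward the current position.
        self.prompt_len = prompt_len
        self.prepopulated_prompt_len = prepopulated_prompt_len
        self.context_current_position = prepopulated_prompt_len

    def is_first_context_chunk(self) -> bool:
        # The old check was `self.context_current_position == 0`, which turns False as
        # soon as any blocks are reused, so the draft model's KV pages were never allocated.
        return self.context_current_position == self.prepopulated_prompt_len


# Example: 64 of 128 prompt tokens were recovered from reused blocks.
req = RequestSketch(prompt_len=128, prepopulated_prompt_len=64)
assert req.is_first_context_chunk()  # still the first chunk of new context work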

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 0 additions & 6 deletions
@@ -258,12 +258,6 @@ def __init__(self,
             ResourceManagerType.KV_CACHE_MANAGER)
         self.enable_kv_cache_events = self.kv_cache_manager is not None and self.kv_cache_manager.event_buffer_max_size > 0

-        if self.draft_model_engine is not None and self.kv_cache_manager is not None:
-            if self.kv_cache_manager.enable_block_reuse:
-                raise NotImplementedError(
-                    "Draft model engine + KV cache reuse is not supported yet. "
-                    "This will be fixed in the near future!")
-
         self.max_input_len = max_input_len
         # _executor_loop private data
         self.max_num_active_requests = model_engine.get_max_num_sequences()

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 0 additions & 15 deletions
@@ -162,21 +162,6 @@ def _mangle_executor_config(executor_config: ExecutorConfig):
         )
         executor_config.kv_cache_config.enable_block_reuse = False

-    spec_config = executor_config.speculative_config
-    if spec_config is not None and spec_config.spec_dec_mode.has_draft_model():
-        # The draft and target models have different KV cache managers to support
-        # different head sizes, dtypes, etc. in the generic case.
-        # However, this line will set context_current_position > 0 if there are
-        # cached blocks: https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/resource_manager.py#L310.
-        # It actually mutates the LLM request! As a result, when we try to allocate KV cache
-        # pages for the draft model, is_first_context_chunk returns False and
-        # no pages are allocated.
-        # We need to refactor LLMRequest to fix this. Disable block reuse for now.
-        logger.warning(
-            f"Disabling block reuse for speculation algorithm {spec_config.spec_dec_mode}"
-        )
-        executor_config.kv_cache_config.enable_block_reuse = False
-
     if pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION" and executor_config.enable_chunked_context:
         logger.warning(
             f"Disabling chunked context for {pytorch_backend_config.attn_backend} backend"

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 2 additions & 0 deletions
@@ -57,6 +57,8 @@ l0_b200:
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_deepseek"
   - unittest/_torch/auto_deploy/unit/singlegpu
+  - unittest/_torch/speculative/test_eagle3.py
+  - unittest/_torch/speculative/test_kv_cache_reuse.py
 - condition:
     ranges:
       system_gpu_count:

tests/unittest/_torch/speculative/test_eagle3.py

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,8 @@
     [
         [True, "TRTLLM", True, False, False],
         [False, "TRTLLM", True, False, False],
+        [True, "TRTLLM", True, True, False],
+        [False, "TRTLLM", True, True, False],
         [True, "FLASHINFER", True, False, False],
         [False, "FLASHINFER", True, False, False],
         [False, "TRTLLM", False, True, True],
tests/unittest/_torch/speculative/test_kv_cache_reuse.py

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+import os
+import sys
+import unittest
+
+import pytest
+import torch
+from utils.llm_data import llm_models_root
+
+from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
+                                 KvCacheConfig)
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+
+@pytest.mark.parametrize("use_cuda_graph,attn_backend", [
+    [True, "TRTLLM"],
+    [False, "TRTLLM"],
+])
+@pytest.mark.high_cuda_memory
+def test_kv_cache_reuse(use_cuda_graph: bool, attn_backend: str):
+    # Two-model Eagle3 with KV cache block reuse enabled.
+    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+    if total_mem_gb < 35:
+        pytest.skip("Not enough memory to load target + draft model")
+
+    models_path = llm_models_root()
+    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
+    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+
+    # bs > 1 gives non-deterministic results with in-flight batching: there is a
+    # slight chance that the reference and speculative outputs do not match 100%.
+    max_batch_size = 1
+    max_draft_len = 4
+    kv_cache_config = KvCacheConfig(enable_block_reuse=True,
+                                    free_gpu_memory_fraction=0.5)
+    cuda_graph_config = CudaGraphConfig(
+        batch_sizes=[1]) if use_cuda_graph else None
+
+    llm_common_config = dict(
+        model=target_model_dir,
+        attn_backend=attn_backend,
+        disable_overlap_scheduler=True,
+        cuda_graph_config=cuda_graph_config,
+        max_batch_size=max_batch_size,
+        kv_cache_config=kv_cache_config,
+        # This max_seq_len is larger than the one specified
+        # in the llama 3 8B eagle's config. We want to make sure
+        # that the draft model won't go above its max in warmup
+        # in this test.
+        max_seq_len=8192,
+    )
+
+    spec_config = EagleDecodingConfig(
+        max_draft_len=max_draft_len,
+        speculative_model_dir=eagle_model_dir,
+        eagle3_one_model=False,
+    )
+
+    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
+
+    # Output tests
+    prompt = "The future of AI is"
+
+    sampling_params = SamplingParams(max_tokens=10, temperature=0)
+
+    # First run: populates the KV cache.
+    results = llm_spec.generate(prompt, sampling_params)
+    generated_text = results.outputs[0].text
+
+    # Second run: reuses the cached KV blocks and must produce the same output.
+    results_kv_cache = llm_spec.generate(prompt, sampling_params)
+    generated_text_kv_cache = results_kv_cache.outputs[0].text
+
+    llm_spec.shutdown()
+
+    assert generated_text == generated_text_kv_cache
+
+
+if __name__ == "__main__":
+    unittest.main()
