@@ -66,6 +66,7 @@ def __init__(self, engine: "PyTorchModelEngine"):
         self.padding_enabled = config.cuda_graph_padding_enabled
         self.supported_batch_sizes = engine._cuda_graph_batch_sizes
         self.max_supported_batch_size = engine._max_cuda_graph_batch_size
+        self.max_num_tokens = engine.max_num_tokens

         # Low-level state, storing resources per batch size
         self.graphs: Dict[int, torch.cuda.CUDAGraph] = {}
@@ -106,8 +107,12 @@ def execute(self, batch: ScheduledRequests, inputs: Dict[str, Any],

         return self._run_graph(batch_size, inputs)

-    def _capture_graph(self, batch_size: int, forward_fn: Callable,
-                       initial_inputs: Dict[str, Any]):
+    def _capture_graph(self,
+                       batch_size: int,
+                       forward_fn: Callable,
+                       initial_inputs: Dict[str, Any],
+                       gather_ids: torch.Tensor,
+                       gather_context_logits: bool = False):
         """Captures the forward pass for a given batch size."""
         engine = self._get_engine()

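The two new parameters plumb logit gathering into graph capture: `gather_ids` appears to select the token positions whose logits are needed, while `gather_context_logits` keeps logits for all context tokens. A hedged sketch of what a `forward_fn` matching the new signature might do; the toy `lm_head` and the exact gathering semantics are assumptions, not this repo's implementation:

```python
import torch

hidden_size, vocab_size = 16, 32
lm_head = torch.nn.Linear(hidden_size, vocab_size).cuda()

def forward_fn(hidden: torch.Tensor, gather_ids: torch.Tensor,
               gather_context_logits: bool = False) -> torch.Tensor:
    """Return logits for all tokens, or only at the gathered positions."""
    if gather_context_logits:
        return lm_head(hidden)            # logits for every token
    return lm_head(hidden[gather_ids])    # logits only where sampling happens
```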
@@ -117,13 +122,13 @@ def _capture_graph(self, batch_size: int, forward_fn: Callable,

         static_tensors = {
             "input_ids":
-            torch.ones((batch_size * max_tokens_per_req, ),
+            torch.ones((self.max_num_tokens, ),
                        device="cuda",
                        dtype=torch.int32),
             "position_ids":
             torch.zeros((
                 1,
-                batch_size * max_tokens_per_req,
+                self.max_num_tokens,
             ),
                         device="cuda",
                         dtype=torch.int32),
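Context for the hunk above: a replayed CUDA graph only ever reads from the buffers it was captured on, so the static tensors must be sized for the engine-wide worst case (`self.max_num_tokens`) rather than per batch, with each step's real inputs copied into the front of those buffers before replay. A minimal sketch of that staging pattern; `MAX_NUM_TOKENS` and `stage_inputs` are illustrative names standing in for `engine.max_num_tokens` and whatever this runner does before replay:

```python
import torch

MAX_NUM_TOKENS = 8192  # assumed engine-wide cap, mirroring engine.max_num_tokens

# Worst-case-sized buffer the graph is captured on, like static_tensors["input_ids"].
static_input_ids = torch.ones((MAX_NUM_TOKENS, ), device="cuda", dtype=torch.int32)

def stage_inputs(input_ids: torch.Tensor) -> None:
    """Copy this step's tokens into the static buffer before graph replay."""
    num_tokens = input_ids.size(0)
    assert num_tokens <= MAX_NUM_TOKENS, "batch exceeds the captured buffer"
    static_input_ids[:num_tokens].copy_(input_ids, non_blocking=True)
```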
@@ -144,10 +149,11 @@ def _capture_graph(self, batch_size: int, forward_fn: Callable,
         graph = torch.cuda.CUDAGraph()
         with capturing_cuda_graph_context():
             for _ in range(self.WARMUP_STEPS):
-                forward_fn(capture_inputs)
+                forward_fn(capture_inputs, gather_ids, gather_context_logits)

             with torch.cuda.graph(graph, pool=self.memory_pool):
-                output = forward_fn(capture_inputs)
+                output = forward_fn(capture_inputs, gather_ids,
+                                    gather_context_logits)

         self.graphs[batch_size] = graph
         self.graph_outputs[batch_size] = make_weak_ref(output)
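For readers unfamiliar with the capture flow this hunk touches: the warmup iterations run first so one-time allocations and lazy initialization are not recorded into the graph, and the actual capture shares a memory pool across batch sizes. A standalone sketch of the same pattern using only the public `torch.cuda.graph` API; the toy model and sizes are illustrative, and `capturing_cuda_graph_context`/`forward_fn` from the diff are not reproduced:

```python
import torch

model = torch.nn.Linear(16, 16).cuda()
static_x = torch.zeros(8, 16, device="cuda")

# Warm up on a side stream so lazy init work is not captured
# (analogous to the WARMUP_STEPS loop above).
side = torch.cuda.Stream()
side.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side):
    for _ in range(3):
        model(static_x)
torch.cuda.current_stream().wait_stream(side)

pool = torch.cuda.graph_pool_handle()  # shared pool, like self.memory_pool
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=pool):
    static_y = model(static_x)  # recorded into the graph, not executed eagerly

# Replay: refresh the static input in place, then launch the captured kernels.
static_x.copy_(torch.randn(8, 16, device="cuda"))
graph.replay()
torch.cuda.synchronize()
print(static_y)
```

Storing only a weak reference to `output` (as `make_weak_ref` does above) presumably lets the graph's memory pool, rather than the runner, own the output storage.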