We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent fdfacbc commit 2eb3030Copy full SHA for 2eb3030
tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -750,18 +750,6 @@ def disable_optimization(backend: Backend):
750
if bs > self.batch_size:
751
# skip batch size larger than self.batch_size
752
continue
753
- with release_batch(get_cuda_graph_warmup_request(bs)) as batch:
754
- if batch is None:
755
- # No KV cache space!
756
- return
757
- logger.info(
758
- f"Run generation only CUDA graph warmup for batch size={bs}"
759
- )
760
- self.cuda_graph_model_engine.execute(
761
- batch,
762
- new_tensors_device=None,
763
- resource_manager=resource_manager)
764
- torch.cuda.synchronize()
765
766
for draft_len in draft_lengths:
767
with release_batch(
0 commit comments