8 changes: 8 additions & 0 deletions vllm/forward_context.py
@@ -205,6 +205,10 @@ class ForwardContext:

    ubatch_slices: UBatchSlices | None = None

    # set dynamically for each forward pass
    # True during memory profiling, False otherwise
    is_memory_profile: bool = False

Comment on lines +208 to +211
Member

Should we avoid adding too many things to the forward_context? It is becoming increasingly complicated and I am increasingly worried about this class getting more and more bloated. cc @WoosukKwon @youkaichao

    def __post_init__(self):
        assert self.cudagraph_runtime_mode.valid_runtime_modes(), (
            f"Invalid cudagraph runtime mode: {self.cudagraph_runtime_mode}"
@@ -235,6 +239,7 @@ def create_forward_context(
    cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
    batch_descriptor: BatchDescriptor | None = None,
    ubatch_slices: UBatchSlices | None = None,
    is_memory_profile: bool = False,
):
    return ForwardContext(
        no_compile_layers=vllm_config.compilation_config.static_forward_context,
@@ -244,6 +249,7 @@ def create_forward_context(
        cudagraph_runtime_mode=cudagraph_runtime_mode,
        batch_descriptor=batch_descriptor,
        ubatch_slices=ubatch_slices,
        is_memory_profile=is_memory_profile,
    )


@@ -272,6 +278,7 @@ def set_forward_context(
    cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
    batch_descriptor: BatchDescriptor | None = None,
    ubatch_slices: UBatchSlices | None = None,
    is_memory_profile: bool = False,
):
    """A context manager that stores the current forward context,
    can be attention metadata, etc.
@@ -317,6 +324,7 @@
        cudagraph_runtime_mode,
        batch_descriptor,
        ubatch_slices,
        is_memory_profile,
    )

    try:
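The flag only takes effect if whatever drives the memory-profiling pass sets it when entering the context. A minimal usage sketch follows; the `profile_run` wrapper, the dummy model call, and the assumption that `set_forward_context` takes the attention metadata and the vLLM config as its first two arguments are illustrative, not part of this diff — only the `is_memory_profile=True` keyword comes from the change above.

```python
# Hypothetical usage sketch (not part of this PR): enter the forward context
# with is_memory_profile=True during the profiling pass so that backends which
# call get_forward_context() can over-allocate for their worst case.
from vllm.forward_context import get_forward_context, set_forward_context


def profile_run(model, dummy_input, attn_metadata, vllm_config):
    # set_forward_context is the context manager shown in the diff above;
    # every argument other than is_memory_profile keeps its default here.
    with set_forward_context(attn_metadata, vllm_config, is_memory_profile=True):
        assert get_forward_context().is_memory_profile
        return model(dummy_input)
```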
31 changes: 19 additions & 12 deletions vllm/v1/attention/backends/mla/common.py
@@ -209,7 +209,9 @@
from vllm.attention.ops.merge_attn_states import merge_attn_states
from vllm.attention.utils.fa_utils import get_flash_attn_version
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.config.compilation import CUDAGraphMode
from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank
from vllm.forward_context import get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.batch_invariant import (
vllm_is_batch_invariant,
@@ -1917,18 +1919,23 @@ def forward(
            )

        if attn_metadata is None:
            # During the profile run try to simulate to worse case output size
            # for `self.kv_b_proj(kv_c_normed)` in `_compute_prefill_context`
            # since this can be large
            _ = torch.empty(
                (
                    self.chunked_prefill_workspace_size,
                    self.num_heads,
                    self.qk_nope_head_dim + self.v_head_dim,
                ),
                device=k_c_normed.device,
                dtype=k_c_normed.dtype,
            )
            # During the profile run or cudagraph capture, try to simulate the
            # worst-case output size for `self.kv_b_proj(kv_c_normed)` in
            # `_compute_prefill_context`, since this can be large
            forward_ctx = get_forward_context()
            if (
                forward_ctx.is_memory_profile
                or forward_ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE
            ):
                _ = torch.empty(
                    (
                        self.chunked_prefill_workspace_size,
                        self.num_heads,
                        self.qk_nope_head_dim + self.v_head_dim,
                    ),
                    device=k_c_normed.device,
                    dtype=k_c_normed.dtype,
                )

            # The zero fill is required when used with DP + EP
            # to ensure all ranks within a DP group compute the
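A note on why the guarded `torch.empty` above helps even though its result is discarded: the memory-profiling pass reads peak CUDA memory, so a transient worst-case allocation is enough to reserve headroom for the largest possible `kv_b_proj` output (and the same dummy allocation during CUDA graph capture presumably keeps that workspace inside the captured graph's memory pool). A standalone PyTorch illustration of the peak-memory effect, generic code not taken from vLLM:

```python
# Standalone illustration (not vLLM code): a transient allocation still raises
# the peak-memory counter that a profiling pass reads afterwards.
import torch

torch.cuda.reset_peak_memory_stats()
scratch = torch.empty((1024, 1024, 64), device="cuda", dtype=torch.float16)  # ~128 MiB
del scratch  # freed right away, but the recorded peak already includes it
print(f"peak allocated: {torch.cuda.max_memory_allocated() / 2**20:.0f} MiB")
```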