
Commit 59947ac

prefill_max_num_batched_tokens optimization
Signed-off-by: Ther-LF <[email protected]>
1 parent 22b54d6 commit 59947ac

4 files changed: +41, -50 lines


vllm/config/scheduler.py

Lines changed: 16 additions & 30 deletions
@@ -5,7 +5,7 @@
 from dataclasses import InitVar, field
 from typing import Any, Literal

-from pydantic import Field, SkipValidation, model_validator
+from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self

@@ -37,10 +37,10 @@ class SchedulerConfig:
     This config has no static default. If left unspecified by the user, it will
     be set in `EngineArgs.create_engine_config` based on the usage context."""

-    prefill_max_num_batched_tokens: int = Field(init=False)
-    """Prefill maximum number of tokens to be processed in a single iteration.
-
-    This config is used when there are no decoding requests."""
+    prefill_max_num_batched_tokens: int | None = None
+    """Maximum number of tokens to be processed in a single iteration when there
+    are no decode requests. If not set (None), defaults to max_num_batched_tokens.
+    Must satisfy: prefill_max_num_batched_tokens >= max_num_batched_tokens."""

     max_num_seqs: SkipValidation[int] = None  # type: ignore
     """Maximum number of sequences to be processed in a single iteration.
@@ -80,11 +80,6 @@ class SchedulerConfig:
     """If True, prefill requests can be chunked based
     on the remaining max_num_batched_tokens."""

-    enable_hybrid_chunked_prefill: bool = False
-    """If True, prefill requests will only be chunked when there are decode
-    requests present, otherwise they will proceed with normal prefill
-    computation to increase throughput."""
-
     is_multimodal_model: bool = False
     """True if the model is multimodal."""

@@ -183,9 +178,6 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
                 " prefix caching; disabling both."
             )

-        self.prefill_max_num_batched_tokens = max(
-            self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS
-        )
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
                 self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
@@ -203,30 +195,23 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
                     self.max_num_batched_tokens,
                     POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
                 )
-                self.prefill_max_num_batched_tokens = max(
-                    self.prefill_max_num_batched_tokens,
-                    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
-                )
             if self.is_multimodal_model:
                 # The value needs to be at least the number of multimodal tokens
                 self.max_num_batched_tokens = max(
                     self.max_num_batched_tokens,
                     MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                 )
-                self.prefill_max_num_batched_tokens = max(
-                    self.prefill_max_num_batched_tokens,
-                    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-                )
             # When using default settings,
             # Ensure max_num_batched_tokens does not exceed model limit.
             # Some models (e.g., Whisper) have embeddings tied to max length.
             self.max_num_batched_tokens = min(
                 self.max_num_seqs * self.max_model_len, self.max_num_batched_tokens
             )
-            self.prefill_max_num_batched_tokens = min(
-                self.max_num_seqs * self.max_model_len,
-                self.prefill_max_num_batched_tokens,
-            )
+
+        # Initialize prefill_max_num_batched_tokens based on user input
+        if self.prefill_max_num_batched_tokens is None:
+            # Default to max_num_batched_tokens
+            self.prefill_max_num_batched_tokens = self.max_num_batched_tokens
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens

@@ -318,12 +303,13 @@ def _verify_args(self) -> Self:
                 f"max_num_partial_prefills ({self.max_num_partial_prefills})."
             )

-        if self.enable_hybrid_chunked_prefill and not self.chunked_prefill_enabled:
+        # Validate prefill_max_num_batched_tokens
+        if self.prefill_max_num_batched_tokens < self.max_num_batched_tokens:
             raise ValueError(
-                "Hybrid chunked prefill can only be enabled when chunked "
-                "prefill is enabled. Please set --enable-chunked-prefill=True "
-                "or disable hybrid chunked prefill by setting "
-                "--enable-hybrid-chunked-prefill=False."
+                f"prefill_max_num_batched_tokens "
+                f"({self.prefill_max_num_batched_tokens}) must be greater "
+                f"than or equal to max_num_batched_tokens "
+                f"({self.max_num_batched_tokens})."
             )

         return self
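
The defaulting and validation rules introduced above reduce to a small pure function. The sketch below is illustrative only, not vLLM code: the parameter names mirror the SchedulerConfig fields and the token values in the asserts are made up.

# Illustrative sketch (not vLLM code) of the defaulting and validation rules
# added to SchedulerConfig above.

def resolve_prefill_budget(
    max_num_batched_tokens: int,
    prefill_max_num_batched_tokens: int | None,
) -> int:
    """Return the effective prefill-only token budget."""
    if prefill_max_num_batched_tokens is None:
        # Unset: fall back to the regular batch budget.
        return max_num_batched_tokens
    if prefill_max_num_batched_tokens < max_num_batched_tokens:
        # Mirrors the check in _verify_args above.
        raise ValueError(
            f"prefill_max_num_batched_tokens ({prefill_max_num_batched_tokens}) "
            f"must be greater than or equal to max_num_batched_tokens "
            f"({max_num_batched_tokens})."
        )
    return prefill_max_num_batched_tokens


# Example values (made up): 8192-token mixed batches, 16384-token prefill-only steps.
assert resolve_prefill_budget(8192, None) == 8192
assert resolve_prefill_budget(8192, 16384) == 16384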

vllm/engine/arg_utils.py

Lines changed: 8 additions & 6 deletions
@@ -424,6 +424,9 @@ class EngineArgs:
     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
     kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
     max_num_batched_tokens: int | None = SchedulerConfig.max_num_batched_tokens
+    prefill_max_num_batched_tokens: int | None = (
+        SchedulerConfig.prefill_max_num_batched_tokens
+    )
     max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
     max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
     long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
@@ -483,7 +486,6 @@ class EngineArgs:
     ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")

     enable_chunked_prefill: bool | None = SchedulerConfig.enable_chunked_prefill
-    enable_hybrid_chunked_prefill: bool = SchedulerConfig.enable_hybrid_chunked_prefill
     disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input

     disable_hybrid_kv_cache_manager: bool = (
@@ -1005,6 +1007,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         scheduler_group.add_argument(
             "--max-num-batched-tokens", **scheduler_kwargs["max_num_batched_tokens"]
         )
+        scheduler_group.add_argument(
+            "--prefill-max-num-batched-tokens",
+            **scheduler_kwargs["prefill_max_num_batched_tokens"],
+        )
         scheduler_group.add_argument(
             "--max-num-seqs", **scheduler_kwargs["max_num_seqs"]
         )
@@ -1030,10 +1036,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         scheduler_group.add_argument(
             "--enable-chunked-prefill", **scheduler_kwargs["enable_chunked_prefill"]
         )
-        scheduler_group.add_argument(
-            "--enable-hybrid-chunked-prefill",
-            **scheduler_kwargs["enable_hybrid_chunked_prefill"],
-        )
         scheduler_group.add_argument(
             "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"]
         )
@@ -1578,11 +1580,11 @@ def create_engine_config(
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
             max_num_batched_tokens=self.max_num_batched_tokens,
+            prefill_max_num_batched_tokens=self.prefill_max_num_batched_tokens,
             max_num_seqs=self.max_num_seqs,
             max_model_len=model_config.max_model_len,
             num_lookahead_slots=num_lookahead_slots,
             enable_chunked_prefill=self.enable_chunked_prefill,
-            enable_hybrid_chunked_prefill=self.enable_hybrid_chunked_prefill,
             disable_chunked_mm_input=self.disable_chunked_mm_input,
             is_multimodal_model=model_config.is_multimodal_model,
             is_encoder_decoder=model_config.is_encoder_decoder,
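
With the field and CLI flag wired through EngineArgs above, the new limit can be set alongside the existing one. The snippet below is an illustrative sketch, not part of the commit: the model name and token values are hypothetical, and only the EngineArgs fields and the --prefill-max-num-batched-tokens flag are taken from the diff.

# Illustrative only: model name and token values are hypothetical.
from vllm.engine.arg_utils import EngineArgs

# Python-API equivalent of something like:
#   vllm serve <model> --enable-chunked-prefill \
#       --max-num-batched-tokens 8192 --prefill-max-num-batched-tokens 16384
args = EngineArgs(
    model="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical model choice
    enable_chunked_prefill=True,
    max_num_batched_tokens=8192,               # budget for mixed prefill/decode steps
    prefill_max_num_batched_tokens=16384,      # larger budget when nothing is decoding
)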

vllm/v1/core/sched/scheduler.py

Lines changed: 11 additions & 10 deletions
@@ -224,13 +224,15 @@ def schedule(self) -> SchedulerOutput:
         num_scheduled_tokens: dict[str, int] = {}

         token_budget = self.max_num_scheduled_tokens
-        # Check if there are any requests in the decode phase in the running queue
-        # when hybrid chunked prefill is enabled.
-        has_decode_requests = True
-        if self.scheduler_config.enable_hybrid_chunked_prefill:
-            has_decode_requests = self._has_decode_reqs
-            if not has_decode_requests:
-                token_budget = self.prefill_max_num_scheduled_tokens
+        # Check if there are any requests in the decode phase in the running queue.
+        # If no decode requests and prefill_max_num_batched_tokens is larger,
+        # use the larger budget for better throughput.
+        has_decode_requests = self._has_decode_reqs
+        if (
+            not has_decode_requests
+            and self.prefill_max_num_scheduled_tokens > self.max_num_scheduled_tokens
+        ):
+            token_budget = self.prefill_max_num_scheduled_tokens

         # Encoder-related.
         scheduled_encoder_inputs: dict[str, list[int]] = {}
@@ -499,7 +501,6 @@ def schedule(self) -> SchedulerOutput:
                # pooling requests to be chunked
                if (
                    not self.scheduler_config.chunked_prefill_enabled
-                    and not self.scheduler_config.enable_hybrid_chunked_prefill
                    and num_new_tokens > token_budget
                ):
                    self.waiting.pop_request()
@@ -626,8 +627,8 @@ def schedule(self) -> SchedulerOutput:
         # Check if the scheduling constraints are satisfied.
         total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
         if (
-            self.scheduler_config.enable_hybrid_chunked_prefill
-            and not has_decode_requests
+            not has_decode_requests
+            and self.prefill_max_num_scheduled_tokens > self.max_num_scheduled_tokens
         ):
             assert total_num_scheduled_tokens <= self.prefill_max_num_scheduled_tokens
         else:
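
The budget selection at the top of schedule() boils down to the rule sketched below. This is a standalone illustration, not the scheduler code; the names mirror the attributes used in the diff and the numeric values are examples.

# Standalone illustration of the token-budget rule: use the larger prefill
# budget only when nothing is decoding and the prefill limit is strictly
# larger than the regular limit.

def select_token_budget(
    max_num_scheduled_tokens: int,
    prefill_max_num_scheduled_tokens: int,
    has_decode_requests: bool,
) -> int:
    if (
        not has_decode_requests
        and prefill_max_num_scheduled_tokens > max_num_scheduled_tokens
    ):
        return prefill_max_num_scheduled_tokens
    return max_num_scheduled_tokens


# Example values (made up): prefill-only steps get 16384 tokens, mixed steps 8192.
assert select_token_budget(8192, 16384, has_decode_requests=False) == 16384
assert select_token_budget(8192, 16384, has_decode_requests=True) == 8192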

vllm/v1/worker/gpu_model_runner.py

Lines changed: 6 additions & 4 deletions
@@ -249,10 +249,12 @@ def __init__(
         self.is_multimodal_pruning_enabled = False
         self.max_model_len = model_config.max_model_len
         self.dcp_world_size = self.parallel_config.decode_context_parallel_size
-        if scheduler_config.enable_hybrid_chunked_prefill:
-            self.max_num_tokens = scheduler_config.prefill_max_num_batched_tokens
-        else:
-            self.max_num_tokens = scheduler_config.max_num_batched_tokens
+        # Use the larger of max_num_batched_tokens and prefill_max_num_batched_tokens
+        # for memory profiling to ensure we allocate enough memory
+        self.max_num_tokens = max(
+            scheduler_config.max_num_batched_tokens,
+            scheduler_config.prefill_max_num_batched_tokens,
+        )
         self.max_num_reqs = scheduler_config.max_num_seqs

         # Broadcast PP output for external_launcher (torchrun)
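
Sizing note: the runner now profiles memory with the larger of the two limits, so prefill-only steps scheduled with the bigger budget still fit. A tiny illustrative check of that invariant (values are made up; names mirror the scheduler_config fields):

# The runner's token capacity must cover whichever budget the scheduler uses.
max_num_batched_tokens = 8192
prefill_max_num_batched_tokens = 16384
max_num_tokens = max(max_num_batched_tokens, prefill_max_num_batched_tokens)
assert max_num_tokens >= max_num_batched_tokens          # mixed prefill/decode steps fit
assert max_num_tokens >= prefill_max_num_batched_tokens  # prefill-only steps fit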
