
Commit 69e9f6d

[fix]: Skip prompt length checking for generation only requests (#6146)
Signed-off-by: Pengyun Lin <[email protected]>
1 parent 66030ef commit 69e9f6d

Showing 3 changed files with 21 additions and 9 deletions.

tensorrt_llm/disaggregated_params.py

Lines changed: 2 additions & 2 deletions
@@ -6,10 +6,10 @@

 @dataclass(slots=True, kw_only=True)
 class DisaggregatedParams:
-    """Disaggregated seving parameters.
+    """Disaggregated serving parameters.

     Args:
-        request_type (str): The type of request ("context_only" or "generation_only")
+        request_type (str): The type of request ("context_only" | "generation_only" | "context_and_generation")
         first_gen_tokens (List[int]): The first tokens of the generation request
         ctx_request_id (int): The context request id
         opaque_state(bytes): Any additional state needing to be exchanged between context and gen instances
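As illustration only, a minimal sketch of constructing these parameters for a generation-only request. The import path follows the file path above; the field values are hypothetical placeholders, not real handover state from a context instance:

from tensorrt_llm.disaggregated_params import DisaggregatedParams

# Hypothetical generation-only request: the prompt was already prefilled on a
# context instance, which handed back its first token, its request id, and an
# opaque KV-cache transfer blob.
gen_params = DisaggregatedParams(
    request_type="generation_only",
    first_gen_tokens=[42],   # first token(s) produced by the context phase
    ctx_request_id=1,        # id of the originating context request
    opaque_state=b"",        # backend-specific handover state (placeholder)
)

Because the dataclass is declared with kw_only=True, all fields must be passed as keyword arguments, as shown.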

tensorrt_llm/llmapi/llm.py

Lines changed: 10 additions & 7 deletions
@@ -334,9 +334,9 @@ def generate_async(
         # With pytorch backend, py_executor has logic to handle max_tokens of 1,
         # so set to 1 to avoid allocating unnecessary KV cache blocks for single request
         # TODO: Also support for trt backend
-        if (disaggregated_params is not None
-                and disaggregated_params.request_type == "context_only"
-                and not self._on_trt_backend):
+        is_ctx_only = disaggregated_params is not None and disaggregated_params.request_type == "context_only"
+        is_gen_only = disaggregated_params is not None and disaggregated_params.request_type == "generation_only"
+        if is_ctx_only and not self._on_trt_backend:
             sampling_params.max_tokens = 1

         inputs = prompt_inputs(inputs)
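This hunk replaces one inline condition with two named flags so that is_gen_only can be reused by the argument check later in generate_async. A self-contained sketch of the same gating, with the backend flag and sampling params stubbed out for illustration (names here are not the library API):

from dataclasses import dataclass

@dataclass
class _SamplingParams:  # stand-in for tensorrt_llm's SamplingParams
    max_tokens: int = 256

def resolve_disagg_flags(disaggregated_params, on_trt_backend: bool,
                         sampling_params: _SamplingParams):
    # Same logic as the diff: derive both flags once, then cap max_tokens at 1
    # for context-only requests on non-TRT backends, since a context-only
    # request produces exactly one token before handing off to generation.
    is_ctx_only = (disaggregated_params is not None
                   and disaggregated_params.request_type == "context_only")
    is_gen_only = (disaggregated_params is not None
                   and disaggregated_params.request_type == "generation_only")
    if is_ctx_only and not on_trt_backend:
        sampling_params.max_tokens = 1
    return is_ctx_only, is_gen_only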
@@ -401,7 +401,8 @@ def generate_async(
         self._check_arguments(
             len(prompt_token_ids),
             len(query_token_ids) if query_token_ids is not None else 0,
-            sampling_params)
+            sampling_params,
+            is_gen_only=is_gen_only)
         if _postproc_params:
             _postproc_params.postproc_args.num_prompt_tokens = len(
                 prompt_token_ids)
@@ -529,7 +530,8 @@ def _prepare_sampling_params(
         return sampling_params

     def _check_arguments(self, prompt_len: int, query_len: int,
-                         sampling_params: SamplingParams) -> None:
+                         sampling_params: SamplingParams,
+                         is_gen_only: bool) -> None:

         if self.args.backend in ["pytorch", "_autodeploy"]:
             # TODO: remove these checks after PyTorch backend
@@ -543,11 +545,12 @@ def _check_arguments(self, prompt_len: int, query_len: int,
                 f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
             )
         # Check prompt length and query length against max_num_tokens to filter illegal requests.
-        if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill:
+        # Skip check for gen-only requests
+        if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill and not is_gen_only:
             max_num_tokens = self.args.max_num_tokens
             if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens:
                 raise ValueError(
-                    f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}), query length ({query_len}) and max_tokens ({sampling_params.max_tokens}) should not exceed "
+                    f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}), query length ({query_len}) should not exceed "
                     f"max_num_tokens ({max_num_tokens})")
         return
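The rationale for the skip: a gen-only request arrives with a prompt that was already prefilled on the context instance, so validating it against this instance's max_num_tokens would wrongly reject legal requests. A standalone sketch of the check as it now behaves; the function name and flat argument list are illustrative, not the library API:

def check_prompt_fits(prompt_len: int, query_len: int, cp_size: int,
                      max_num_tokens: int, enable_chunked_prefill: bool,
                      is_gen_only: bool) -> None:
    # Chunked prefill splits long prompts across scheduler steps, and gen-only
    # requests were prefilled elsewhere, so the budget check applies only when
    # neither holds.
    if enable_chunked_prefill or is_gen_only:
        return
    if max_num_tokens and prompt_len / cp_size + query_len > max_num_tokens:
        raise ValueError(
            f"The sum of prompt length ({prompt_len / cp_size}) and query "
            f"length ({query_len}) should not exceed "
            f"max_num_tokens ({max_num_tokens})")

# check_prompt_fits(8192, 0, 1, 4096, False, is_gen_only=True) returns
# silently; the same call with is_gen_only=False raises ValueError.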

tensorrt_llm/llmapi/llm_args.py

Lines changed: 9 additions & 0 deletions
@@ -1357,6 +1357,15 @@ def set_runtime_knobs_from_build_config(self):

         return self

+    @model_validator(mode="after")
+    def validate_runtime_args(self):
+        if self.max_batch_size is not None and self.max_num_tokens is not None:
+            if self.max_batch_size > self.max_num_tokens:
+                logger.warning(
+                    f"max_batch_size [{self.max_batch_size}] should be less than or equal to max_num_tokens [{self.max_num_tokens}]"
+                )
+        return self
+
     @model_validator(mode="after")
     def validate_build_config_with_runtime_params(self):
         # Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,
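Note that the new validator only warns rather than raises: max_batch_size > max_num_tokens is suspicious (each in-flight request consumes at least one token of budget per step, so the extra batch slots cannot be used) but construction still succeeds. The same pattern on a minimal stand-alone Pydantic v2 model, with RuntimeArgs as a hypothetical stand-in for LlmArgs:

import logging
from typing import Optional

from pydantic import BaseModel, model_validator

logger = logging.getLogger(__name__)

class RuntimeArgs(BaseModel):
    """Minimal stand-in for LlmArgs, showing the validator pattern."""
    max_batch_size: Optional[int] = None
    max_num_tokens: Optional[int] = None

    @model_validator(mode="after")
    def validate_runtime_args(self):
        # Warn (rather than raise) when the batch size exceeds the token
        # budget; the model remains usable either way.
        if self.max_batch_size is not None and self.max_num_tokens is not None:
            if self.max_batch_size > self.max_num_tokens:
                logger.warning(
                    f"max_batch_size [{self.max_batch_size}] should be less "
                    f"than or equal to max_num_tokens [{self.max_num_tokens}]")
        return self

# RuntimeArgs(max_batch_size=256, max_num_tokens=128) logs the warning but
# still constructs, because the validator returns self instead of raising.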
