[BugFix] Eagerly abort final-step requests

njhill · njhill · commit bf566524ebf7 · 2025-12-03T10:25:54.000-08:00
Signed-off-by: Nick Hill <nhill@redhat.com>
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -204,6 +204,8 @@ def __init__(
         )
         self.async_scheduling = vllm_config.scheduler_config.async_scheduling
 
+        self.aborts_queue = queue.Queue[list[str]]()
+
         # Mark the startup heap as static so that it's ignored by GC.
         # Reduces pause times of oldest generation collections.
         freeze_gc_heap()
@@ -347,6 +349,8 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
             if model_output is None:
                 model_output = self.model_executor.sample_tokens(grammar_output)
 
+        # Ensure we handle aborts which happened during the model execution.
+        self._process_aborts_queue()
         engine_core_outputs = self.scheduler.update_from_output(
             scheduler_output, model_output
         )
@@ -440,6 +444,8 @@ def step_with_batch_queue(
         with self.log_error_detail(scheduler_output):
             model_output = future.result()
 
+        # Ensure we handle aborts which happened during the model execution.
+        self._process_aborts_queue()
         engine_core_outputs = self.scheduler.update_from_output(
             scheduler_output, model_output
         )
@@ -458,6 +464,10 @@ def step_with_batch_queue(
 
         return engine_core_outputs, model_executed
 
+    def _process_aborts_queue(self):  # Hand every pending aborts_queue entry to abort_requests.
+        while not self.aborts_queue.empty():  # Non-blocking drain: stop as soon as the queue is empty.
+            self.abort_requests(self.aborts_queue.get_nowait())
+
     def shutdown(self):
         self.structured_output_manager.clear_backend()
         if self.model_executor:
@@ -871,9 +881,13 @@ def _process_input_queue(self):
             and not self.scheduler.has_requests()
             and not self.batch_queue
         ):
-            if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
-                logger.debug("EngineCore waiting for work.")
-                waited = True
+            if self.input_queue.empty():
+                # Drain aborts queue; all aborts are also processed via input_queue.
+                with self.aborts_queue.mutex:
+                    self.aborts_queue.queue.clear()
+                if logger.isEnabledFor(DEBUG):
+                    logger.debug("EngineCore waiting for work.")
+                    waited = True
             req = self.input_queue.get()
             self._handle_client_request(*req)
 
@@ -1027,6 +1041,10 @@ def process_input_sockets(
                     else:
                         request = generic_decoder.decode(data_frames)
 
+                        if request_type == EngineCoreRequestType.ABORT:
+                            # Aborts are added to *both* queues.
+                            self.aborts_queue.put_nowait(request)
+
                     # Push to input queue for core busy loop.
                     self.input_queue.put_nowait((request_type, request))