@@ -61,6 +61,7 @@
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 from vllm.v1.structured_output import StructuredOutputManager
+from vllm.v1.utils import record_function_or_nullcontext
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
@@ -315,17 +316,21 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         # or finished and not yet removed from the batch.
         if not self.scheduler.has_requests():
             return {}, False
-        scheduler_output = self.scheduler.schedule()
-        future = self.model_executor.execute_model(scheduler_output, non_block=True)
-        grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
-        with self.log_error_detail(scheduler_output):
-            model_output = future.result()
-        if model_output is None:
-            model_output = self.model_executor.sample_tokens(grammar_output)
-
-        engine_core_outputs = self.scheduler.update_from_output(
-            scheduler_output, model_output
-        )
+        with record_function_or_nullcontext("core step: schedule"):
+            scheduler_output = self.scheduler.schedule()
+
+        with record_function_or_nullcontext("core step: execute_model"):
+            future = self.model_executor.execute_model(scheduler_output, non_block=True)
+            grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
+            with self.log_error_detail(scheduler_output):
+                model_output = future.result()
+            if model_output is None:
+                model_output = self.model_executor.sample_tokens(grammar_output)
+
+        with record_function_or_nullcontext("core step: update_from_output"):
+            engine_core_outputs = self.scheduler.update_from_output(
+                scheduler_output, model_output
+            )
 
         return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0
 
@@ -363,32 +368,49 @@ def step_with_batch_queue(
         model_executed = False
         deferred_scheduler_output = None
         if self.scheduler.has_requests():
-            scheduler_output = self.scheduler.schedule()
-            exec_future = self.model_executor.execute_model(
-                scheduler_output, non_block=True
-            )
+            with record_function_or_nullcontext("core step_with_batch_queue: schedule"):
+                scheduler_output = self.scheduler.schedule()
+            with record_function_or_nullcontext(
+                "core step_with_batch_queue: execute_model"
+            ):
+                exec_future = self.model_executor.execute_model(
+                    scheduler_output, non_block=True
+                )
             model_executed = scheduler_output.total_num_scheduled_tokens > 0
 
             if scheduler_output.pending_structured_output_tokens:
-                # We need to defer sampling until we have processed the model output
-                # from the prior step.
-                deferred_scheduler_output = scheduler_output
-                # Block-wait for execute to return (continues running async on the GPU).
-                with self.log_error_detail(scheduler_output):
-                    exec_result = exec_future.result()
-                assert exec_result is None
+                with record_function_or_nullcontext(
+                    "core step_with_batch_queue: pending_structured_output_tokens"
+                ):
+                    # We need to defer sampling until we have processed the model output
+                    # from the prior step.
+                    deferred_scheduler_output = scheduler_output
+                    # Block-wait for execute to return
+                    # (continues running async on the GPU).
+                    with self.log_error_detail(scheduler_output):
+                        exec_result = exec_future.result()
+                    assert exec_result is None
             else:
-                # We aren't waiting for any tokens, get any grammar output immediately.
-                grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
+                with record_function_or_nullcontext(
+                    "core step_with_batch_queue: get_grammar_bitmask"
+                ):
+                    # We aren't waiting for any tokens, get any grammar
+                    # output immediately.
+                    grammar_output = self.scheduler.get_grammar_bitmask(
+                        scheduler_output
+                    )
                 # Block-wait for execute to return (continues running async on the GPU).
                 with self.log_error_detail(scheduler_output):
                     exec_result = exec_future.result()
 
                 if exec_result is None:
-                    # Call sample tokens.
-                    future = self.model_executor.sample_tokens(
-                        grammar_output, non_block=True
-                    )
+                    with record_function_or_nullcontext(
+                        "core step_with_batch_queue: sample_tokens"
+                    ):
+                        # Call sample tokens.
+                        future = self.model_executor.sample_tokens(
+                            grammar_output, non_block=True
+                        )
                 else:
                     # No sampling required (e.g. all requests finished).
                     future = cast(Future[ModelRunnerOutput], exec_future)
@@ -408,27 +430,34 @@ def step_with_batch_queue(
             # only be called when the scheduler contains requests or the queue
             # is non-empty.
             return None, False
-
-        # Block until the next result is available.
-        future, scheduler_output = batch_queue.pop()
-        with self.log_error_detail(scheduler_output):
-            model_output = future.result()
-
-        engine_core_outputs = self.scheduler.update_from_output(
-            scheduler_output, model_output
-        )
+        with record_function_or_nullcontext("core step_with_batch_queue: model_output"):
+            # Block until the next result is available.
+            future, scheduler_output = batch_queue.pop()
+            with self.log_error_detail(scheduler_output):
+                model_output = future.result()
+        with record_function_or_nullcontext(
+            "core step_with_batch_queue: update_from_output"
+        ):
+            engine_core_outputs = self.scheduler.update_from_output(
+                scheduler_output, model_output
+            )
 
         # NOTE(nick): We can either handle the deferred tasks here or save
         # in a field and do it immediately once step_with_batch_queue is
         # re-called. The latter slightly favors TTFT over TPOT/throughput.
         if deferred_scheduler_output:
-            # We now have the tokens needed to compute the bitmask for the
-            # deferred request. Get the bitmask and call sample tokens.
-            grammar_output = self.scheduler.get_grammar_bitmask(
-                deferred_scheduler_output
-            )
-            future = self.model_executor.sample_tokens(grammar_output, non_block=True)
-            batch_queue.appendleft((future, deferred_scheduler_output))
+            with record_function_or_nullcontext(
+                "core step_with_batch_queue: deferred_scheduler_output"
+            ):
+                # We now have the tokens needed to compute the bitmask for the
+                # deferred request. Get the bitmask and call sample tokens.
+                grammar_output = self.scheduler.get_grammar_bitmask(
+                    deferred_scheduler_output
+                )
+                future = self.model_executor.sample_tokens(
+                    grammar_output, non_block=True
+                )
+                batch_queue.appendleft((future, deferred_scheduler_output))
 
         return engine_core_outputs, model_executed
 
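
The change above wraps each phase of the engine-core step loop in record_function_or_nullcontext, so scheduling, model execution, sampling, and output processing appear as separately named ranges when a torch profiler trace is captured, and add no overhead otherwise. The real helper is imported from vllm.v1.utils; the sketch below is only a plausible shape for such a helper, and the gating condition (an environment variable here) is an assumption, not the actual vLLM logic.

    # Hypothetical sketch; the real record_function_or_nullcontext lives in
    # vllm.v1.utils and may gate on a different condition.
    import os
    from contextlib import AbstractContextManager, nullcontext

    import torch


    def record_function_or_nullcontext(name: str) -> AbstractContextManager:
        # When profiling is requested, emit a named range that torch profiler
        # traces can attribute time to; otherwise return a no-op context so the
        # hot path stays cheap.
        if os.environ.get("VLLM_TORCH_PROFILER_DIR"):  # assumed gating condition
            return torch.profiler.record_function(name)
        return nullcontext()


    # Usage mirrors the diff above:
    #     with record_function_or_nullcontext("core step: schedule"):
    #         scheduler_output = self.scheduler.schedule()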