From d715d59c8b2e60cd7e8341133c85a459c5a7604a Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Wed, 8 Oct 2025 21:22:03 +0300
Subject: [PATCH 1/5] Fix issue with async_scheduling when dealing with
 chunked input

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 877e1583..5bfb3078 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2989,12 +2989,12 @@ def execute_model(
             # If logits_indices is smaller than req_id,
             # add the last token position
             if logits_indices.shape[0] < len(req_id):
-                if structured_output:
+                if structured_output or self.use_async_scheduling:
                     logits_append = torch.tensor([torch.sum(prompt_len) - 1],
                                                  device=token_ids.device,
                                                  dtype=torch.int32)
                     logits_indices = torch.cat([logits_indices, logits_append])
-                elif self.use_async_scheduling:
+                if self.use_async_scheduling:
                     # Discard partial prefill logits for async scheduling
                     # Depends on 1 decode token/batch
                     invalid_req_indices.append(num_decodes + idx)
@@ -3299,7 +3299,7 @@ def execute_model(
             return AsyncHPUModelRunnerOutput(
                 model_runner_output=model_runner_output,
                 sampled_token_ids=sampled_token_ids,
-                invalid_req_indices=[],
+                invalid_req_indices=invalid_req_indices,
                 async_output_copy_stream=self.async_output_copy_stream,
             )
         model_runner_output = ModelRunnerOutput(

From f075944eaec7fc979b4b85ec61a679bbd39c583c Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Wed, 8 Oct 2025 22:32:23 +0300
Subject: [PATCH 2/5] Dummy commit

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 5bfb3078..f75f0681 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2995,7 +2995,7 @@ def execute_model(
                                                  dtype=torch.int32)
                     logits_indices = torch.cat([logits_indices, logits_append])
                 if self.use_async_scheduling:
-                    # Discard partial prefill logits for async scheduling
+                    # Discard partial prefill logit for async scheduling
                     # Depends on 1 decode token/batch
                     invalid_req_indices.append(num_decodes + idx)
             htorch.core.mark_step()

From 9999809e7e0d46ec7f7b57ef5830e1251f8c9356 Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Thu, 9 Oct 2025 02:40:04 +0300
Subject: [PATCH 3/5] Clarify invalid_req_indices

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index f75f0681..02266ae0 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2997,7 +2997,8 @@ def execute_model(
                 if self.use_async_scheduling:
                     # Discard partial prefill logit for async scheduling
                     # Depends on 1 decode token/batch
-                    invalid_req_indices.append(num_decodes + idx)
+                    prefill_start_idx = num_decodes
+                    invalid_req_indices.append(prefill_start_idx + idx)
             htorch.core.mark_step()
             non_flattened_hidden_states, aux_hidden_states, \
                 sample_hidden_states, logits_device = \

From 62a27b176f1f0d4d38a6b69cbb0fbbfa3628862f Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Tue, 14 Oct 2025 01:29:13 +0300
Subject: [PATCH 4/5] Add NOTE

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 85da5588..c91e544e 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2993,7 +2993,7 @@ def execute_model(
 
             self.event_start = self.profiler.get_timestamp_us()
             self.profiler.start("internal", "prefill")
-            # Align behavior of incomplete prompt with gpu_model_runner
+            # NOTE(tianmu-li): Align behavior of incomplete prompt with gpu_model_runner
             # If logits_indices is smaller than req_id,
             # add the last token position
             if logits_indices.shape[0] < len(req_id):

From 331b117887f4f422578f3a371806a2b88a7a5666 Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Tue, 14 Oct 2025 01:53:39 +0300
Subject: [PATCH 5/5] Add clarification text

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index c91e544e..d869ce50 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2994,14 +2994,18 @@ def execute_model(
             self.event_start = self.profiler.get_timestamp_us()
             self.profiler.start("internal", "prefill")
             # NOTE(tianmu-li): Align behavior of incomplete prompt with gpu_model_runner
-            # If logits_indices is smaller than req_id,
-            # add the last token position
+            # If logits_indices is smaller than req_id, the last request is a chunked prompt request that
+            # hasn't finished in this step. We add the last token position to logits_indices to ensure
+            # the last token of the chunk is sampled. This sampled token will be discarded later
             if logits_indices.shape[0] < len(req_id):
                 if structured_output or self.use_async_scheduling:
-                    logits_append = torch.tensor([torch.sum(prompt_len) - 1],
-                                                 device=token_ids.device,
-                                                 dtype=torch.int32)
-                    logits_indices = torch.cat([logits_indices, logits_append])
+                    # When there are multiple requests in the batch (e.g. self.use_merged_prefill=True),
+                    # the last token position is the sum of all prompt lengths - 1
+                    # This logic also holds when there is only one request in the batch
+                    logits_indices_append = torch.tensor([torch.sum(prompt_len) - 1],
+                                                         device=token_ids.device,
+                                                         dtype=torch.int32)
+                    logits_indices = torch.cat([logits_indices, logits_indices_append])
                 if self.use_async_scheduling:
                     # Discard partial prefill logit for async scheduling
                     # Depends on 1 decode token/batch
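
For reference, below is a minimal standalone sketch of the behavior the series converges on: pad logits_indices with the last token position of an unfinished (chunked) prompt, and under async scheduling record that request's output slot so the extra sampled token is later discarded. The wrapper function pad_logits_indices_for_chunked_prefill, its signature, and the example values are invented for illustration; only the branch bodies mirror the final diff.

# Hypothetical sketch, not the runner itself; names such as prompt_len,
# num_decodes and idx mirror the diff, the harness around them is assumed.
import torch

def pad_logits_indices_for_chunked_prefill(
    logits_indices: torch.Tensor,   # positions whose logits the scheduler asked for
    req_id: list,                   # request ids in this (merged) prefill batch
    prompt_len: torch.Tensor,       # per-request prompt lengths in the batch
    num_decodes: int,               # decode requests precede prefills in the output order
    idx: int,                       # index of this prefill batch
    use_async_scheduling: bool,
    structured_output: bool,
    invalid_req_indices: list,
) -> torch.Tensor:
    """If the last request is a partially prefilled (chunked) prompt, sample its
    last token anyway and, under async scheduling, mark that slot for discarding."""
    if logits_indices.shape[0] < len(req_id):
        if structured_output or use_async_scheduling:
            # Last token position of the merged prompt: sum of all prompt lengths - 1.
            logits_indices_append = torch.tensor([torch.sum(prompt_len) - 1],
                                                 device=logits_indices.device,
                                                 dtype=torch.int32)
            logits_indices = torch.cat([logits_indices, logits_indices_append])
        if use_async_scheduling:
            # Record the flat output index of this request so the extra sampled
            # token is thrown away instead of being surfaced as real output.
            invalid_req_indices.append(num_decodes + idx)
    return logits_indices


# Example: two merged prompts of lengths 5 and 3; the second is chunked, so the
# scheduler only asked for the first prompt's last-token logits (position 4).
invalid = []
out = pad_logits_indices_for_chunked_prefill(
    logits_indices=torch.tensor([4], dtype=torch.int32),
    req_id=["req-0", "req-1"],
    prompt_len=torch.tensor([5, 3]),
    num_decodes=2,
    idx=0,
    use_async_scheduling=True,
    structured_output=False,
    invalid_req_indices=invalid,
)
print(out.tolist())   # [4, 7]  -> position 7 is the chunk's last token
print(invalid)        # [2]     -> output slot 2 will be discarded

The appended position only ensures the chunk's last token has a logit to sample (the "Depends on 1 decode token/batch" assumption); with async scheduling that token is never returned to the scheduler, because the slot recorded in invalid_req_indices is dropped when AsyncHPUModelRunnerOutput is consumed, which is why patch 1 also wires invalid_req_indices into AsyncHPUModelRunnerOutput instead of passing an empty list.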