From d715d59c8b2e60cd7e8341133c85a459c5a7604a Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Wed, 8 Oct 2025 21:22:03 +0300
Subject: [PATCH 1/5] Fix issue with async_scheduling when dealing with
 chunked input

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 877e1583..5bfb3078 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2989,12 +2989,12 @@ def execute_model(
             # If logits_indices is smaller than req_id,
             # add the last token position
             if logits_indices.shape[0] < len(req_id):
-                if structured_output:
+                if structured_output or self.use_async_scheduling:
                     logits_append = torch.tensor([torch.sum(prompt_len) - 1],
                                                  device=token_ids.device,
                                                  dtype=torch.int32)
                     logits_indices = torch.cat([logits_indices, logits_append])
-                elif self.use_async_scheduling:
+                if self.use_async_scheduling:
                     # Discard partial prefill logits for async scheduling
                     # Depends on 1 decode token/batch
                     invalid_req_indices.append(num_decodes + idx)
@@ -3299,7 +3299,7 @@ def execute_model(
             return AsyncHPUModelRunnerOutput(
                 model_runner_output=model_runner_output,
                 sampled_token_ids=sampled_token_ids,
-                invalid_req_indices=[],
+                invalid_req_indices=invalid_req_indices,
                 async_output_copy_stream=self.async_output_copy_stream,
             )
         model_runner_output = ModelRunnerOutput(

From f075944eaec7fc979b4b85ec61a679bbd39c583c Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Wed, 8 Oct 2025 22:32:23 +0300
Subject: [PATCH 2/5] Dummy commit

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 5bfb3078..f75f0681 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2995,7 +2995,7 @@ def execute_model(
                                                  dtype=torch.int32)
                     logits_indices = torch.cat([logits_indices, logits_append])
                 if self.use_async_scheduling:
-                    # Discard partial prefill logits for async scheduling
+                    # Discard partial prefill logit for async scheduling
                     # Depends on 1 decode token/batch
                     invalid_req_indices.append(num_decodes + idx)
             htorch.core.mark_step()

From 9999809e7e0d46ec7f7b57ef5830e1251f8c9356 Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Thu, 9 Oct 2025 02:40:04 +0300
Subject: [PATCH 3/5] Clarify invalid_req_indices

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index f75f0681..02266ae0 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2997,7 +2997,8 @@ def execute_model(
                 if self.use_async_scheduling:
                     # Discard partial prefill logit for async scheduling
                     # Depends on 1 decode token/batch
-                    invalid_req_indices.append(num_decodes + idx)
+                    prefill_start_idx = num_decodes
+                    invalid_req_indices.append(prefill_start_idx + idx)
             htorch.core.mark_step()
             non_flattened_hidden_states, aux_hidden_states, \
                 sample_hidden_states, logits_device = \

From 62a27b176f1f0d4d38a6b69cbb0fbbfa3628862f Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Tue, 14 Oct 2025 01:29:13 +0300
Subject: [PATCH 4/5] Add NOTE

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 85da5588..c91e544e 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2993,7 +2993,7 @@ def execute_model(
 
             self.event_start = self.profiler.get_timestamp_us()
             self.profiler.start("internal", "prefill")
-            # Align behavior of incomplete prompt with gpu_model_runner
+            # NOTE(tianmu-li): Align behavior of incomplete prompt with gpu_model_runner
             # If logits_indices is smaller than req_id,
             # add the last token position
             if logits_indices.shape[0] < len(req_id):

From 331b117887f4f422578f3a371806a2b88a7a5666 Mon Sep 17 00:00:00 2001
From: Tianmu Li
Date: Tue, 14 Oct 2025 01:53:39 +0300
Subject: [PATCH 5/5] Add clarification text

Signed-off-by: Tianmu Li
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index c91e544e..d869ce50 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -2994,14 +2994,18 @@ def execute_model(
             self.event_start = self.profiler.get_timestamp_us()
             self.profiler.start("internal", "prefill")
             # NOTE(tianmu-li): Align behavior of incomplete prompt with gpu_model_runner
-            # If logits_indices is smaller than req_id,
-            # add the last token position
+            # If logits_indices is smaller than req_id, the last request is a chunked prompt request that
+            # hasn't finished in this step. We add the last token position to logits_indices to ensure
+            # the last token of the chunk is sampled. This sampled token will be discarded later
             if logits_indices.shape[0] < len(req_id):
                 if structured_output or self.use_async_scheduling:
-                    logits_append = torch.tensor([torch.sum(prompt_len) - 1],
-                                                 device=token_ids.device,
-                                                 dtype=torch.int32)
-                    logits_indices = torch.cat([logits_indices, logits_append])
+                    # When there are multiple requests in the batch (e.g. self.use_merged_prefill=True),
+                    # the last token position is the sum of all prompt lengths - 1
+                    # This logic also holds when there is only one request in the batch
+                    logits_indices_append = torch.tensor([torch.sum(prompt_len) - 1],
+                                                         device=token_ids.device,
+                                                         dtype=torch.int32)
+                    logits_indices = torch.cat([logits_indices, logits_indices_append])
                 if self.use_async_scheduling:
                     # Discard partial prefill logit for async scheduling
                     # Depends on 1 decode token/batch
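
For reference, below is a minimal standalone sketch of the behavior the series converges on: pad logits_indices with the last token position of an unfinished (chunked) prompt, and under async scheduling record that request's output slot so the extra sampled token is later discarded. The wrapper function pad_logits_indices_for_chunked_prefill, its signature, and the example values are invented for illustration; only the branch bodies mirror the final diff.

# Hypothetical sketch, not the runner itself; names such as prompt_len,
# num_decodes and idx mirror the diff, the harness around them is assumed.
import torch

def pad_logits_indices_for_chunked_prefill(
    logits_indices: torch.Tensor,   # positions whose logits the scheduler asked for
    req_id: list,                   # request ids in this (merged) prefill batch
    prompt_len: torch.Tensor,       # per-request prompt lengths in the batch
    num_decodes: int,               # decode requests precede prefills in the output order
    idx: int,                       # index of this prefill batch
    use_async_scheduling: bool,
    structured_output: bool,
    invalid_req_indices: list,
) -> torch.Tensor:
    """If the last request is a partially prefilled (chunked) prompt, sample its
    last token anyway and, under async scheduling, mark that slot for discarding."""
    if logits_indices.shape[0] < len(req_id):
        if structured_output or use_async_scheduling:
            # Last token position of the merged prompt: sum of all prompt lengths - 1.
            logits_indices_append = torch.tensor([torch.sum(prompt_len) - 1],
                                                 device=logits_indices.device,
                                                 dtype=torch.int32)
            logits_indices = torch.cat([logits_indices, logits_indices_append])
        if use_async_scheduling:
            # Record the flat output index of this request so the extra sampled
            # token is thrown away instead of being surfaced as real output.
            invalid_req_indices.append(num_decodes + idx)
    return logits_indices


# Example: two merged prompts of lengths 5 and 3; the second is chunked, so the
# scheduler only asked for the first prompt's last-token logits (position 4).
invalid = []
out = pad_logits_indices_for_chunked_prefill(
    logits_indices=torch.tensor([4], dtype=torch.int32),
    req_id=["req-0", "req-1"],
    prompt_len=torch.tensor([5, 3]),
    num_decodes=2,
    idx=0,
    use_async_scheduling=True,
    structured_output=False,
    invalid_req_indices=invalid,
)
print(out.tolist())   # [4, 7]  -> position 7 is the chunk's last token
print(invalid)        # [2]     -> output slot 2 will be discarded

The appended position only ensures the chunk's last token has a logit to sample (the "Depends on 1 decode token/batch" assumption); with async scheduling that token is never returned to the scheduler, because the slot recorded in invalid_req_indices is dropped when AsyncHPUModelRunnerOutput is consumed, which is why patch 1 also wires invalid_req_indices into AsyncHPUModelRunnerOutput instead of passing an empty list.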