Fix MTP chunked-prefill output; fix unit test

freeliuzc · freeliuzc · commit ae62b28bdbb6 · 2025-10-13T16:55:03.000+08:00
diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -710,8 +710,11 @@ void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
 void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens,
                                       const paddle::Tensor& accept_num,
                                       const paddle::Tensor& not_need_stop,
+                                      const paddle::Tensor& seq_lens_decoder,
+                                      const paddle::Tensor& prompt_lens,
                                       int64_t rank_id,
-                                      bool save_each_rank);
+                                      bool save_each_rank,
+                                      bool skip_prefill);
 
 
 void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
@@ -28,9 +28,12 @@
 void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
                                 const paddle::Tensor& accept_num,
                                 const paddle::Tensor& not_need_stop,
+                                const paddle::Tensor& seq_lens_decoder,
+                                const paddle::Tensor& prompt_lens,
                                 int64_t rank_id,
                                 int msg_queue_id,
-                                int save_each_rank) {
+                                int save_each_rank,
+                                bool skip_prefill) {
     // printf("enter save output");
     if (!save_each_rank && rank_id > 0) {
         return;
@@ -43,6 +46,11 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
     int64_t* accept_tokens_data = accept_tokens_cpu.data<int64_t>();
     int* accept_num_data = accept_num_cpu.data<int>();
 
+    auto seq_lens_decoder_cpu = seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
+    auto prompt_lens_cpu = prompt_lens.copy_to(paddle::CPUPlace(), true);
+    int* seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
+    int64_t* prompt_lens_data = prompt_lens_cpu.data<int64_t>();
+
     if (const char* inference_msg_queue_id_env_p =
             std::getenv("INFERENCE_MSG_QUEUE_ID")) {
         std::string inference_msg_queue_id_env_str(
@@ -95,7 +103,7 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
     msg_sed.mtext[1] = bsz;
 
     for (int i = 2; i < MAX_BSZ + 2; i++) {
-        if (i - 2 >= bsz) {
+        if (i - 2 >= bsz || (skip_prefill && seq_lens_decoder_data[i - 2] < prompt_lens_data[i - 2])) {
             msg_sed.mtext[i] = 0;
         } else {
             msg_sed.mtext[i] = (int)accept_num_data[i - 2];
@@ -125,32 +133,38 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
 void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens,
                                       const paddle::Tensor& accept_num,
                                       const paddle::Tensor& not_need_stop,
+                                      const paddle::Tensor& seq_lens_decoder,
+                                      const paddle::Tensor& prompt_lens,
                                       int64_t rank_id,
-                                      bool save_each_rank) {
+                                      bool save_each_rank,
+                                      bool skip_prefill) {
     SpeculateSaveWithOutputMsg(
-        accept_tokens, accept_num, not_need_stop, rank_id, 1, save_each_rank);
+        accept_tokens, accept_num, not_need_stop, seq_lens_decoder, prompt_lens, rank_id, 1, save_each_rank, skip_prefill);
 }
 
 void SpeculateSaveWithOutputMsgDynamic(const paddle::Tensor& accept_tokens,
                                        const paddle::Tensor& accept_num,
                                        const paddle::Tensor& not_need_stop,
+                                       const paddle::Tensor& seq_lens_decoder,
+                                       const paddle::Tensor& prompt_lens,
                                        int64_t rank_id,
                                        int msg_queue_id,
-                                       bool save_each_rank) {
+                                       bool save_each_rank,
+                                       bool skip_prefill) {
     SpeculateSaveWithOutputMsg(
-        accept_tokens, accept_num, not_need_stop, rank_id, msg_queue_id, save_each_rank);
+        accept_tokens, accept_num, not_need_stop, seq_lens_decoder, prompt_lens, rank_id, msg_queue_id, save_each_rank, skip_prefill);
 }
 
 PD_BUILD_STATIC_OP(speculate_save_output)
-    .Inputs({"accept_tokens", "accept_num", "not_need_stop"})
-    .Attrs({"rank_id: int64_t", "save_each_rank: bool"})
+    .Inputs({"accept_tokens", "accept_num", "not_need_stop", "seq_lens_decoder", "prompt_lens"})
+    .Attrs({"rank_id: int64_t", "save_each_rank: bool", "skip_prefill: bool"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"accept_tokens", "x_out"}})
     .SetKernelFn(PD_KERNEL(SpeculateSaveWithOutputMsgStatic));
 
 PD_BUILD_STATIC_OP(speculate_save_output_dynamic)
-    .Inputs({"accept_tokens", "accept_num", "not_need_stop"})
-    .Attrs({"rank_id: int64_t", "msg_queue_id: int", "save_each_rank: bool"})
+    .Inputs({"accept_tokens", "accept_num", "not_need_stop", "seq_lens_decoder", "prompt_lens"})
+    .Attrs({"rank_id: int64_t", "msg_queue_id: int", "save_each_rank: bool", "skip_prefill: bool"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"accept_tokens", "x_out"}})
     .SetKernelFn(PD_KERNEL(SpeculateSaveWithOutputMsgDynamic));
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu
@@ -20,30 +20,33 @@
 
 __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
                                                    const int64_t *accept_tokens,
-                                                   const int *accept_num,
+                                                   int *accept_num,
                                                    const bool *stop_flags,
                                                    const int *seq_lens_encoder,
-                                                   const int *seq_lens_decoder,
+                                                   int *seq_lens_decoder,
                                                    const int64_t *step_idx,
                                                    int bs,
                                                    int length,
                                                    int max_draft_tokens) {
     int tid = threadIdx.x;
-    if (tid < bs && !stop_flags[tid]) {
-        int64_t *pre_ids_all_now = pre_ids_all + tid * length;
-        const int64_t *accept_tokens_now =
-            accept_tokens + tid * max_draft_tokens;
-        const int seq_len_dec = seq_lens_decoder[tid];
-        const int seq_len_enc = seq_lens_encoder[tid];
-        if (seq_len_dec == 0 && seq_len_enc == 0) return;  // stopped
-        // printf("step_idx[tid] %d\n", step_idx[tid]);
-        if (step_idx[tid] >= 0) {
-            for (int i = 0; i < accept_num[tid]; i++) {
-                pre_ids_all_now[step_idx[tid] - i] =
-                    accept_tokens_now[accept_num[tid] - 1 - i];
-                // printf("pre_ids_all_now[step_idx[tid] - i] %d \n",
-                // pre_ids_all_now[step_idx[tid] - i]);
+
+    if (tid < bs) {
+        if (!stop_flags[tid]) {
+            int64_t *pre_ids_all_now = pre_ids_all + tid * length;
+            const int64_t *accept_tokens_now =
+                accept_tokens + tid * max_draft_tokens;
+            const int seq_len_dec = seq_lens_decoder[tid];
+            const int seq_len_enc = seq_lens_encoder[tid];
+            if (seq_len_dec == 0 && seq_len_enc == 0) return;  // stopped
+            if (step_idx[tid] >= 0) {
+                for (int i = 0; i < accept_num[tid]; i++) {
+                    pre_ids_all_now[step_idx[tid] - i] =
+                        accept_tokens_now[accept_num[tid] - 1 - i];
+                }
             }
+        } else {
+            accept_num[tid] = 0;
+            seq_lens_decoder[tid] = 0;
         }
     }
 }
@@ -67,10 +70,10 @@ void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
     speculate_set_value_by_flag_and_id<<<1, block_size, 0, cu_stream>>>(
         const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
         accept_tokens.data<int64_t>(),
-        accept_num.data<int>(),
+        const_cast<int*>(accept_num.data<int>()),
         stop_flags.data<bool>(),
         seq_lens_encoder.data<int>(),
-        seq_lens_decoder.data<int>(),
+        const_cast<int*>(seq_lens_decoder.data<int>()),
         step_idx.data<int64_t>(),
         bs,
         length,
@@ -86,6 +89,9 @@ PD_BUILD_STATIC_OP(speculate_set_value_by_flags_and_idx)
              "seq_lens_encoder",
              "seq_lens_decoder",
              "step_idx"})
-    .Outputs({"pre_ids_all_out"})
-    .SetInplaceMap({{"pre_ids_all", "pre_ids_all_out"}})
+    .Outputs({"pre_ids_all_out", "accept_num_out", "seq_lens_decoder_out"})
+    .SetInplaceMap({
+        {"pre_ids_all", "pre_ids_all_out"},
+        {"accept_num", "accept_num_out"},
+        {"seq_lens_decoder", "seq_lens_decoder_out"}})
     .SetKernelFn(PD_KERNEL(SpeculateSetValueByFlagsAndIdx));
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_update.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_update.cu
@@ -71,9 +71,6 @@ __global__ void speculate_update(int *seq_lens_encoder,
         }
         draft_tokens[bid * max_draft_tokens] =
             accept_tokens[bid * max_draft_tokens + accept_num_now - 1];
-        if (stop_flag_now_int) {
-            seq_lens_decoder[bid] = 0;
-        }
     } else if (bid >= real_bsz && bid < max_bsz) {
         stop_flag_now_int = 1;
     }
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
@@ -64,7 +64,6 @@
         save_output,
         save_output_topk,
         set_stop_value_multi_ends,
-        speculate_clear_accept_nums,
         speculate_get_output_padding_offset,
         speculate_get_padding_offset,
         speculate_get_seq_lens_output,
@@ -369,12 +368,13 @@ def post_process_specualate(
             model_output.accept_tokens,
             model_output.accept_num,
             model_output.not_need_stop,
+            model_output.seq_lens_decoder,
+            model_output.prompt_lens,
             model_output.mp_rank,
             save_each_rank,
+            envs.ENABLE_V1_KVCACHE_SCHEDULER,
         )
 
-    speculate_clear_accept_nums(model_output.accept_num, model_output.seq_lens_decoder)
-
     # Update pre_ids through accept tokens
 
     speculate_set_value_by_flags_and_idx(
diff --git a/fastdeploy/worker/output.py b/fastdeploy/worker/output.py
@@ -250,6 +250,11 @@ class ModelOutputData:
     """
     stop_seqs_len: paddle.Tensor = None
 
+    """
+        the length of input prompt
+    """
+    prompt_lens: paddle.Tensor = None
+
 
 @dataclass
 class ModelRunnerOutput:
diff --git a/tests/operators/test_speculative_schedule_cache.py b/tests/operators/test_speculative_schedule_cache.py
@@ -10,7 +10,9 @@ def cpu_reference(
     draft_tokens,
     block_tables,
     stop_flags,
+    prompt_lens,
     seq_lens_this_time,
+    seq_lens_encoder,
     seq_lens_decoder,
     step_seq_lens_decoder,
     step_draft_tokens,
@@ -101,7 +103,9 @@ def setUp(self):
         self.block_tables = paddle.to_tensor(np.full((self.real_bsz, self.block_num_per_seq), -1, dtype=np.int32))
         # stop_flags length is max_bsz, others are real_bsz
         self.stop_flags = paddle.to_tensor(np.array([False, True, False, False, False], dtype=np.bool_))
+        self.prompt_lens = paddle.to_tensor(np.array([1, 1, 1], dtype=np.int64))
         self.seq_lens_this_time = paddle.to_tensor(np.array([5, 6, 7], dtype=np.int32))
+        self.seq_lens_encoder = paddle.to_tensor(np.array([1, 1, 1], dtype=np.int32))
         self.seq_lens_decoder = paddle.to_tensor(np.array([1, 1, 10], dtype=np.int32))
 
         # Will be filled by kernel for the triggering bids only
@@ -129,7 +133,9 @@ def setUp(self):
         self.np_draft_tokens = self.draft_tokens.numpy().copy()
         self.np_block_tables = self.block_tables.numpy().copy()
         self.np_stop_flags = self.stop_flags.numpy().copy()
+        self.np_prompt_lens = self.prompt_lens.numpy().copy()
         self.np_seq_lens_this_time = self.seq_lens_this_time.numpy().copy()
+        self.np_seq_lens_encoder = self.seq_lens_encoder.numpy().copy()
         self.np_seq_lens_decoder = self.seq_lens_decoder.numpy().copy()
         self.np_step_seq_lens_decoder = self.step_seq_lens_decoder.numpy().copy()
         self.np_step_draft_tokens = self.step_draft_tokens.numpy().copy()
@@ -146,7 +152,9 @@ def test_correctness_against_cpu_reference(self):
             self.draft_tokens,
             self.block_tables,
             self.stop_flags,
+            self.prompt_lens,
             self.seq_lens_this_time,
+            self.seq_lens_encoder,
             self.seq_lens_decoder,
             self.step_seq_lens_decoder,
             self.step_draft_tokens,
@@ -165,7 +173,9 @@ def test_correctness_against_cpu_reference(self):
             self.np_draft_tokens,
             self.np_block_tables,
             self.np_stop_flags,
+            self.prompt_lens,
             self.np_seq_lens_this_time,
+            self.np_seq_lens_encoder,
             self.np_seq_lens_decoder,
             self.np_step_seq_lens_decoder,
             self.np_step_draft_tokens,
@@ -213,7 +223,9 @@ def test_no_trigger_path(self):
             self.draft_tokens,
             self.block_tables,
             self.stop_flags,
+            self.prompt_lens,
             self.seq_lens_this_time,
+            self.seq_lens_encoder,
             self.seq_lens_decoder,
             self.step_seq_lens_decoder,
             self.step_draft_tokens,

Original file line number	Diff line number	Diff line change
`@@ -71,9 +71,6 @@ __global__ void speculate_update(int *seq_lens_encoder,`
`71`	`71`	`}`
`72`	`72`	`draft_tokens[bid * max_draft_tokens] =`
`73`	`73`	`accept_tokens[bid * max_draft_tokens + accept_num_now - 1];`
`74`		`- if (stop_flag_now_int) {`
`75`		`- seq_lens_decoder[bid] = 0;`
`76`		`- }`
`77`	`74`	`} else if (bid >= real_bsz && bid < max_bsz) {`
`78`	`75`	`stop_flag_now_int = 1;`
`79`	`76`	`}`