Merged
32 commits
5d32ae2
use `copy_for_forward`
hnyls2002 Oct 4, 2025
e727062
try `run_batch_exp`
hnyls2002 Oct 4, 2025
2553bdb
tiny fix typo
hnyls2002 Oct 4, 2025
15cff1b
wait for default stream
hnyls2002 Oct 4, 2025
5728f0b
try fix pd-disagg and idle batch
hnyls2002 Oct 4, 2025
1ea1338
add todo comments
hnyls2002 Oct 4, 2025
4e6cccd
tiny skip-sample adjust
hnyls2002 Oct 5, 2025
7905df0
minor style adjust
hnyls2002 Oct 5, 2025
58d04c4
disable prefill only optimizations in spec
hnyls2002 Oct 5, 2025
78f1719
fix
hnyls2002 Oct 5, 2025
d530623
update
hnyls2002 Oct 5, 2025
8f2f2e7
Merge branch 'main' into lsyin/remove-overlap-thread
hnyls2002 Oct 5, 2025
d61df9b
Merge branch 'lsyin/tiny-skip-sample-adjust' into lsyin/remove-overla…
hnyls2002 Oct 5, 2025
5083846
try fix grammar sync
hnyls2002 Oct 5, 2025
a5a7159
fix prefill only and delay sample when eagle
hnyls2002 Oct 5, 2025
a42bd28
Merge branch 'main' into lsyin/remove-overlap-thread
hnyls2002 Oct 5, 2025
0edfa65
try fix data overwirte by cuda buffer (introduce real copy done later)
hnyls2002 Oct 5, 2025
d04ae3c
Merge branch 'main' into lsyin/remove-overlap-thread
hnyls2002 Oct 6, 2025
320571a
Merge branch 'main' into lsyin/remove-overlap-thread
hnyls2002 Oct 6, 2025
0d5ec64
Merge branch 'main' into lsyin/remove-overlap-thread
zhyncs Oct 6, 2025
dda6db3
tiny fix command show
hnyls2002 Oct 6, 2025
85b6c80
add copy done function
hnyls2002 Oct 6, 2025
3ba61b1
Merge branch 'main' into lsyin/remove-overlap-thread
hnyls2002 Oct 6, 2025
b5b6b4c
remove launch_done
hnyls2002 Oct 6, 2025
1c1973b
fix ascend
hnyls2002 Oct 6, 2025
d623726
remove forward batch output
hnyls2002 Oct 6, 2025
76d57fc
rename
hnyls2002 Oct 6, 2025
5faf1eb
keep reference of launched batch
hnyls2002 Oct 6, 2025
a10f20b
fix pd-disagg
hnyls2002 Oct 6, 2025
b46991e
fix ascend
hnyls2002 Oct 7, 2025
d649de8
Merge branch 'main' into lsyin/remove-overlap-thread
hnyls2002 Oct 7, 2025
c28121e
Merge branch 'main' into lsyin/remove-overlap-thread
hnyls2002 Oct 7, 2025
129 changes: 124 additions & 5 deletions python/sglang/srt/managers/scheduler.py
@@ -112,8 +112,10 @@
UpdateWeightsFromTensorReqInput,
)
from sglang.srt.managers.mm_utils import init_embedding_cache
from sglang.srt.managers.overlap_utils import FutureMap
from sglang.srt.managers.schedule_batch import (
FINISH_ABORT,
ModelWorkerBatch,
MultimodalInputs,
Req,
RequestStage,
@@ -210,6 +212,9 @@ class GenerationBatchResult:
extend_input_len_per_req: List[int]
extend_logprob_start_len_per_req: List[int]

# For overlap scheduling
copy_done: Optional[torch.cuda.Event] = None

@classmethod
def from_forward_batch_output(
cls,
@@ -226,6 +231,7 @@ def from_forward_batch_output(
extend_input_len_per_req=extend_input_len_per_req,
extend_logprob_start_len_per_req=extend_logprob_start_len_per_req,
can_run_cuda_graph=forward_batch_output.can_run_cuda_graph,
copy_done=forward_batch_output.copy_done,
)

@classmethod
@@ -386,12 +392,8 @@ def __init__(
logger.info("Overlap scheduler is disabled for embedding models.")

# Launch a tensor parallel worker
if self.enable_overlap:
TpWorkerClass = TpModelWorkerClient
else:
TpWorkerClass = TpModelWorker

self.tp_worker = TpWorkerClass(
self.tp_worker = TpModelWorker(
server_args=server_args,
gpu_id=gpu_id,
tp_rank=tp_rank,
@@ -616,6 +618,9 @@ def __init__(
# Init prefill kv split size when deterministic inference is enabled with various attention backends
self.init_deterministic_inference_config()

# Init overlap
self.init_overlap()

# Init request dispatcher
self._request_dispatcher = TypeBasedDispatcher(
[
@@ -928,6 +933,21 @@ def init_disaggregation(self):
# The prefill requests that are in the middle of kv sending
self.disagg_prefill_inflight_queue: List[Req] = []

def init_overlap(self):
if not self.enable_overlap:
return

self.forward_stream = torch.get_device_module(self.device).Stream()
self.forward_stream_ctx = torch.get_device_module(self.device).stream(
self.forward_stream
)
self.copy_stream = torch.get_device_module(self.device).Stream()
self.copy_stream_ctx = torch.get_device_module(self.device).stream(
self.copy_stream
)

self.future_map = FutureMap(self.max_running_requests, self.device)

def init_moe_config(self):
if hasattr(self.model_config.hf_config, "num_experts_per_tok"):
initialize_moe_config(self.server_args)
@@ -2031,10 +2051,109 @@ def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]:
batch.prepare_for_decode()
return batch

def run_batch_exp(
self, batch: ScheduleBatch
) -> Union[GenerationBatchResult, EmbeddingBatchResult]:
"""Run a batch."""
self.forward_ct += 1

# Whether to run the profiler
self._profile_batch_predicate(batch)
if self.forward_sleep_time is not None:
logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
time.sleep(self.forward_sleep_time)

# Run forward
if self.is_generation:

batch_or_worker_batch = batch

if self.spec_algorithm.is_none():
# FIXME(lsyin): remove this if and finally unify the abstraction
batch_or_worker_batch = batch.get_model_worker_batch()

if self.enable_overlap:
# FIXME: remove this assert
assert isinstance(batch_or_worker_batch, ModelWorkerBatch)
model_worker_batch = batch_or_worker_batch

# Sampling info will be modified during forward
model_worker_batch.sampling_info = self.tp_worker.cur_sampling_info = (
model_worker_batch.sampling_info.copy_for_forward()
)

bs = len(model_worker_batch.seq_lens)
cur_future_map_ct = self.future_map.update_ct(bs)

with self.forward_stream_ctx:
self.future_map.resolve_future(model_worker_batch)
forward_batch_output = self.model_worker.forward_batch_generation(
batch_or_worker_batch
)
next_token_ids = forward_batch_output.next_token_ids
self.future_map.store_to_map(cur_future_map_ct, bs, next_token_ids)

copy_done = torch.cuda.Event()
copy_done.record()

# FIXME(lsyin): move copy_done elsewhere
forward_batch_output.copy_done = copy_done

# FIXME(lsyin): move this assignment elsewhere
forward_batch_output.next_token_ids = (
self.future_map.update_next_future(cur_future_map_ct, bs)
)
else:
forward_batch_output = self.model_worker.forward_batch_generation(
batch_or_worker_batch
)
copy_done = None

if not self.spec_algorithm.is_none():
# TODO(lsyin): unify this metric-updating logic with non-spec, and move it to decode processing
self.udpate_spec_metrics(
batch.batch_size(), forward_batch_output.num_accepted_tokens
)

# update batch's output ids
batch.output_ids = forward_batch_output.next_token_ids

# print(f"[Run Batch]: {batch.seq_lens_cpu=}")
# print(f"[Run Batch]: {batch.input_ids=}")
# print(f"[Output Ids]: {batch.output_ids}")

# These 2 values are needed for processing the output, but the values can be
# modified by overlap schedule. So we have to copy them here so that
# we can use the correct values in output processing.
if batch.return_logprob or self.spec_algorithm.is_eagle():
extend_input_len_per_req = [req.extend_input_len for req in batch.reqs]
else:
extend_input_len_per_req = None

if batch.return_logprob:
extend_logprob_start_len_per_req = [
req.extend_logprob_start_len for req in batch.reqs
]
else:
extend_logprob_start_len_per_req = None

return GenerationBatchResult.from_forward_batch_output(
forward_batch_output=forward_batch_output,
extend_input_len_per_req=extend_input_len_per_req,
extend_logprob_start_len_per_req=extend_logprob_start_len_per_req,
)
else: # embedding or reward model
model_worker_batch = batch.get_model_worker_batch()
embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
ret = EmbeddingBatchResult(embeddings=embeddings)
return ret

def run_batch(
self, batch: ScheduleBatch
) -> Union[GenerationBatchResult, EmbeddingBatchResult]:
"""Run a batch."""
return self.run_batch_exp(batch)

self.forward_ct += 1

# Whether to run the profiler
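The overlap path above replaces the old TpModelWorkerClient background thread with stream-level overlap inside the scheduler itself: the forward is launched on a dedicated CUDA stream, the not-yet-produced token ids are represented as placeholder slots in a FutureMap, and a recorded copy_done event lets output processing wait only when the results are actually read. The following is a minimal standalone sketch of that pattern, assuming a CUDA device is available; TinyFutureMap, fake_forward, and the batch sizes are illustrative stand-ins, not SGLang APIs.

import torch

class TinyFutureMap:
    # Pre-allocated buffer of future token ids; negative indices mark
    # "not produced yet" slots that are resolved once the GPU has written them.
    def __init__(self, capacity: int, device: str = "cuda"):
        self.capacity = capacity
        self.ct = 0
        self.buf = torch.zeros(capacity, dtype=torch.int64, device=device)

    def alloc(self, bs: int) -> int:
        assert self.ct + bs <= self.capacity, "sketch does not wrap around"
        start, self.ct = self.ct, self.ct + bs
        return start

    def store(self, start: int, values: torch.Tensor):
        self.buf[start : start + values.numel()] = values

    def placeholders(self, start: int, bs: int) -> torch.Tensor:
        return -torch.arange(start + 1, start + bs + 1, device=self.buf.device)

    def resolve(self, ids: torch.Tensor) -> torch.Tensor:
        slots = (-ids - 1).clamp(min=0)
        return torch.where(ids < 0, self.buf[slots], ids)

def fake_forward(bs: int) -> torch.Tensor:
    # Stand-in for forward_batch_generation: next-token ids produced on GPU.
    return torch.randint(0, 32000, (bs,), device="cuda")

def run_overlapped(batch_sizes):
    forward_stream = torch.cuda.Stream()
    future_map = TinyFutureMap(capacity=1024)
    pending = []

    for bs in batch_sizes:
        start = future_map.alloc(bs)
        with torch.cuda.stream(forward_stream):
            # Launch on a side stream so the CPU can keep scheduling batches.
            future_map.store(start, fake_forward(bs))
            copy_done = torch.cuda.Event()
            copy_done.record()
        pending.append((future_map.placeholders(start, bs), copy_done))

    for placeholder_ids, copy_done in pending:
        copy_done.synchronize()  # block the host only when results are needed
        print(future_map.resolve(placeholder_ids).tolist())

if __name__ == "__main__":
    if torch.cuda.is_available():
        run_overlapped([4, 8, 2])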
46 changes: 22 additions & 24 deletions python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -49,29 +49,29 @@ def process_batch_result_prefill(
next_token_ids,
extend_input_len_per_req,
extend_logprob_start_len_per_req,
copy_done,
) = (
result.logits_output,
result.next_token_ids,
result.extend_input_len_per_req,
result.extend_logprob_start_len_per_req,
result.copy_done,
)

if self.enable_overlap:
logits_output, next_token_ids, _ = (
self.tp_worker.resolve_last_batch_result(launch_done)
)
else:
# Move next_token_ids and logprobs to cpu
next_token_ids = next_token_ids.tolist()
if batch.return_logprob:
if logits_output.next_token_logprobs is not None:
logits_output.next_token_logprobs = (
logits_output.next_token_logprobs.tolist()
)
if logits_output.input_token_logprobs is not None:
logits_output.input_token_logprobs = tuple(
logits_output.input_token_logprobs.tolist()
)
if copy_done is not None:
copy_done.synchronize()

# Move next_token_ids and logprobs to cpu
next_token_ids = next_token_ids.tolist()
if batch.return_logprob:
if logits_output.next_token_logprobs is not None:
logits_output.next_token_logprobs = (
logits_output.next_token_logprobs.tolist()
)
if logits_output.input_token_logprobs is not None:
logits_output.input_token_logprobs = tuple(
logits_output.input_token_logprobs.tolist()
)

hidden_state_offset = 0

@@ -206,20 +206,18 @@ def process_batch_result_decode(
result: GenerationBatchResult,
launch_done: Optional[threading.Event] = None,
):
logits_output, next_token_ids, can_run_cuda_graph = (
logits_output, next_token_ids, can_run_cuda_graph, copy_done = (
result.logits_output,
result.next_token_ids,
result.can_run_cuda_graph,
result.copy_done,
)
self.num_generated_tokens += len(batch.reqs)

if self.enable_overlap:
logits_output, next_token_ids, can_run_cuda_graph = (
self.tp_worker.resolve_last_batch_result(launch_done)
)
next_token_logprobs = logits_output.next_token_logprobs
elif batch.spec_algorithm.is_none():
# spec decoding handles output logprobs inside verify process.
if copy_done is not None:
copy_done.synchronize()

if batch.spec_algorithm.is_none():
next_token_ids = next_token_ids.tolist()
if batch.return_logprob:
next_token_logprobs = logits_output.next_token_logprobs.tolist()
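In both process_batch_result_prefill and process_batch_result_decode above, the former resolve_last_batch_result(launch_done) round trip through the worker thread is replaced by waiting on the recorded copy_done event and then reading the tensors directly. Below is a hedged sketch of that event-gated readback; the function and tensor names are illustrative, not the actual result fields.

from typing import Optional

import torch

def read_next_token_ids(
    next_token_ids: torch.Tensor, copy_done: Optional[torch.cuda.Event]
) -> list:
    if copy_done is not None:
        # The host blocks here until all GPU work recorded before the event has
        # finished, so the device-to-host copy below sees the final values.
        copy_done.synchronize()
    return next_token_ids.tolist()

if torch.cuda.is_available():
    side_stream = torch.cuda.Stream()
    with torch.cuda.stream(side_stream):
        ids = torch.randint(0, 32000, (4,), device="cuda")
        done = torch.cuda.Event()
        done.record()
    print(read_next_token_ids(ids, done))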
8 changes: 2 additions & 6 deletions python/sglang/srt/managers/tp_worker_overlap_thread.py
@@ -234,12 +234,8 @@ def forward_batch_generation(
self, model_worker_batch: ModelWorkerBatch
) -> ForwardBatchOutput:
# Create a new copy of sampling_info because it will be updated in-place by the scheduler for the next batch.
sampling_info = model_worker_batch.sampling_info
sampling_info.update_penalties()
model_worker_batch.sampling_info = self.cur_sampling_info = dataclasses.replace(
sampling_info,
sampling_info_done=threading.Event(),
penalizer_orchestrator=None,
model_worker_batch.sampling_info = self.cur_sampling_info = (
model_worker_batch.sampling_info.copy_for_forward()
)

# A cuda stream sync here to avoid the cuda illegal memory access error.
3 changes: 3 additions & 0 deletions python/sglang/srt/model_executor/forward_batch_info.py
@@ -910,6 +910,9 @@ class ForwardBatchOutput:
pp_proxy_tensors: Optional[PPProxyTensors] = None
can_run_cuda_graph: bool = False

# For overlap scheduling
copy_done: Optional[torch.cuda.Event] = None


def enable_num_token_non_padded(server_args):
return get_moe_expert_parallel_world_size() > 1
9 changes: 9 additions & 0 deletions python/sglang/srt/sampling/sampling_batch_info.py
@@ -370,6 +370,15 @@ def merge_batch(self, other: "SamplingBatchInfo"):
self.need_top_k_sampling |= other.need_top_k_sampling
self.need_min_p_sampling |= other.need_min_p_sampling

def copy_for_forward(self):
# Accumulate the penalty into a pre-allocated buffer to get rid of the dependency of `penalizer_orchestrator` later
self.update_penalties()
return dataclasses.replace(
self,
sampling_info_done=threading.Event(),
penalizer_orchestrator=None,
)


def merge_bias_tensor(
lhs: Optional[torch.Tensor],
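copy_for_forward snapshots the sampling state that the forward pass will read, so the scheduler can keep rebinding fields on the live SamplingBatchInfo for the next batch without racing the in-flight forward. Below is a minimal sketch of the idea using a simplified, hypothetical dataclass; the field names are illustrative and do not match the real SamplingBatchInfo layout.

import dataclasses
import threading
from typing import List, Optional

@dataclasses.dataclass
class MiniSamplingInfo:
    temperatures: List[float]
    accumulated_penalties: List[float]
    pending_penalty: float = 0.0
    sampling_info_done: Optional[threading.Event] = None
    penalizer_orchestrator: Optional[object] = None

    def update_penalties(self):
        # Fold pending penalty state into a plain buffer so the frozen copy no
        # longer needs the orchestrator object at forward time.
        self.accumulated_penalties = [
            p + self.pending_penalty for p in self.accumulated_penalties
        ]
        self.pending_penalty = 0.0

    def copy_for_forward(self) -> "MiniSamplingInfo":
        self.update_penalties()
        return dataclasses.replace(
            self,
            sampling_info_done=threading.Event(),
            penalizer_orchestrator=None,
        )

live = MiniSamplingInfo(temperatures=[0.7, 1.0], accumulated_penalties=[0.0, 0.0])
frozen = live.copy_for_forward()
live.temperatures = [0.9, 1.0]            # scheduler rebinds fields for the next batch
assert frozen.temperatures == [0.7, 1.0]  # the forward copy keeps its snapshot
assert frozen.penalizer_orchestrator is None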