5 changes: 5 additions & 0 deletions benchmarks/backend_request_func.py
@@ -58,10 +58,12 @@ class RequestFuncOutput:
"""Output for requesting LLMs via API"""

no: int = 0
request_id: str = ""
generated_text: str = ""
reasoning_content: str = ""
success: bool = False
latency: float = 0.0
end_timestamp: float = 0.0 # timestamp when the model finished responding (seconds, perf_counter-based)
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
arrival_time: list = field(default_factory=list) # arrival_time
@@ -154,6 +156,8 @@ async def async_request_eb_openai_chat_completions(
most_recent_timestamp = timestamp

# output.generated_text = generated_text
# When streaming ends, record the timestamp at which the last chunk was received
output.end_timestamp = most_recent_timestamp
if output.generated_text.strip() == "":
output.success = False
output.error = "No generated text found!"
@@ -170,6 +174,7 @@
)
output.error = error_text or ""
output.success = False
output.request_id = data.get("id", "")
except Exception:
output.success = False
exc_info = sys.exc_info()
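For context, a minimal, self-contained sketch of how the new end_timestamp field is meant to be populated: it records the perf_counter time of the last streamed chunk, not the time post-processing finishes. fake_stream() is a hypothetical stand-in for the aiohttp server-sent-event loop the real function runs; only the timestamp bookkeeping mirrors the PR.

import time
from dataclasses import dataclass

@dataclass
class Output:
    generated_text: str = ""
    success: bool = False
    ttft: float = 0.0           # time to first token
    end_timestamp: float = 0.0  # perf_counter time of the last chunk

def fake_stream():
    # Hypothetical stand-in for a streaming chat-completions response.
    for chunk in ["Hello", ", ", "world"]:
        time.sleep(0.01)
        yield chunk

def run_request() -> Output:
    output = Output()
    start = time.perf_counter()
    most_recent_timestamp = start
    for i, chunk in enumerate(fake_stream()):
        timestamp = time.perf_counter()
        if i == 0:
            output.ttft = timestamp - start
        output.generated_text += chunk
        most_recent_timestamp = timestamp
    # As in the PR: when streaming ends, the timestamp of the last
    # received chunk becomes end_timestamp.
    output.end_timestamp = most_recent_timestamp
    output.success = output.generated_text.strip() != ""
    return output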
36 changes: 32 additions & 4 deletions benchmarks/benchmark_serving.py
@@ -150,7 +150,7 @@ async def get_request(


def calculate_metrics(
input_requests: list[SampleRequest],
# input_requests: list[SampleRequest],
outputs: list[RequestFuncOutput],
dur_s: float,
selected_percentiles: list[float],
@@ -395,6 +395,7 @@ async def benchmark(
print(f"Traffic request rate: {request_rate}")
print(f"Burstiness factor: {burstiness} ({distribution})")
print(f"Maximum request concurrency: {max_concurrency}")
print(f"Drop ratio: {args.drop_ratio}")

pbar = None if disable_tqdm else tqdm(total=len(input_requests))

@@ -443,6 +444,8 @@ async def limited_request_func(request_func_input, pbar):
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

outputs.sort(key=lambda x: x.end_timestamp)

if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
@@ -460,11 +463,30 @@ async def limited_request_func(request_func_input, pbar):
if pbar is not None:
pbar.close()

benchmark_duration = time.perf_counter() - benchmark_start_time
print("benchmark_duration:", benchmark_duration)
drop_ratio = args.drop_ratio
if 0.0 < drop_ratio < 1:
# Per drop_ratio, drop half of the requests from the head and half from the tail; they are excluded from the benchmark statistics
n = len(outputs)
drop_count = int(n * drop_ratio)
half = drop_count // 2
if half > 0:
outputs = outputs[half : n - half]

# Compute the total duration from the timestamps at which the last chunks were received
if len(outputs) >= 2:
benchmark_duration = outputs[-1].end_timestamp - outputs[0].end_timestamp
else:
benchmark_duration = 0.0

print(f"丢弃前数量: {n}")
print(f"丢弃后数量: {len(outputs)}")
print(f"benchmark_duration: {benchmark_duration} 秒")
else:
benchmark_duration = time.perf_counter() - benchmark_start_time
print(f"benchmark_duration: {benchmark_duration} 秒")

metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests,
# input_requests=input_requests,
outputs=outputs,
dur_s=benchmark_duration,
# tokenizer=tokenizer,
@@ -1081,6 +1103,12 @@ def main(args: argparse.Namespace):
action="store_true",
help="shuffle dataset",
)
parser.add_argument(
"--drop-ratio",
type=float,
default=0.0,
help="Drop ratio of the outputs. [0, 1)",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
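Taken together, the benchmark_serving.py changes sort completed requests by end_timestamp, trim a symmetric head and tail according to --drop-ratio, and measure throughput over the window between the first and last surviving completions. A minimal sketch of that logic with made-up numbers (Output and trim_and_measure are illustrative names, not the PR's):

from dataclasses import dataclass

@dataclass
class Output:
    end_timestamp: float

def trim_and_measure(outputs: list[Output], drop_ratio: float) -> tuple[list[Output], float]:
    # Mirrors the PR: sort by completion time, drop half of the
    # requests from each end, then measure the remaining window.
    outputs = sorted(outputs, key=lambda o: o.end_timestamp)
    n = len(outputs)
    if 0.0 < drop_ratio < 1.0:
        half = int(n * drop_ratio) // 2
        if half > 0:
            outputs = outputs[half : n - half]
    duration = outputs[-1].end_timestamp - outputs[0].end_timestamp if len(outputs) >= 2 else 0.0
    return outputs, duration

# Ten requests finishing one second apart with --drop-ratio 0.2:
# one request is dropped from each end, and the measured window runs
# from timestamp 1.0 to 8.0, i.e. 7.0 seconds.
outs = [Output(end_timestamp=float(i)) for i in range(10)]
kept, duration = trim_and_measure(outs, 0.2)
assert len(kept) == 8 and duration == 7.0

The effect is that warm-up and drain requests at the edges of the run no longer dilute the steady-state throughput numbers, at the cost of a shorter measurement window.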