5 changes: 5 additions & 0 deletions benchmarks/backend_request_func.py
@@ -58,10 +58,12 @@ class RequestFuncOutput:
"""Output for requesting LLMs via API"""

no: int = 0
request_id: str = ""
generated_text: str = ""
reasoning_content: str = ""
success: bool = False
latency: float = 0.0
end_timestamp: float = 0.0 # timestamp when the model finished responding (seconds, perf_counter-based)
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
arrival_time: list = field(default_factory=list) # arrival_time
@@ -154,6 +156,8 @@ async def async_request_eb_openai_chat_completions(
most_recent_timestamp = timestamp

# output.generated_text = generated_text
# When streaming ends, record the timestamp at which the last chunk was received
output.end_timestamp = most_recent_timestamp
if output.generated_text.strip() == "":
output.success = False
output.error = "No generated text found!"
@@ -170,6 +174,7 @@
)
output.error = error_text or ""
output.success = False
output.request_id = data.get("id", "")
except Exception:
output.success = False
exc_info = sys.exc_info()
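For context, a minimal, self-contained sketch of how the new end_timestamp field is meant to be populated: it records the perf_counter time of the last streamed chunk, not the time post-processing finishes. fake_stream() is a hypothetical stand-in for the aiohttp server-sent-event loop the real function runs; only the timestamp bookkeeping mirrors the PR.

import time
from dataclasses import dataclass

@dataclass
class Output:
    generated_text: str = ""
    success: bool = False
    ttft: float = 0.0           # time to first token
    end_timestamp: float = 0.0  # perf_counter time of the last chunk

def fake_stream():
    # Hypothetical stand-in for a streaming chat-completions response.
    for chunk in ["Hello", ", ", "world"]:
        time.sleep(0.01)
        yield chunk

def run_request() -> Output:
    output = Output()
    start = time.perf_counter()
    most_recent_timestamp = start
    for i, chunk in enumerate(fake_stream()):
        timestamp = time.perf_counter()
        if i == 0:
            output.ttft = timestamp - start
        output.generated_text += chunk
        most_recent_timestamp = timestamp
    # As in the PR: when streaming ends, the timestamp of the last
    # received chunk becomes end_timestamp.
    output.end_timestamp = most_recent_timestamp
    output.success = output.generated_text.strip() != ""
    return output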
36 changes: 32 additions & 4 deletions benchmarks/benchmark_serving.py
@@ -150,7 +150,7 @@ async def get_request(


def calculate_metrics(
input_requests: list[SampleRequest],
# input_requests: list[SampleRequest],
outputs: list[RequestFuncOutput],
dur_s: float,
selected_percentiles: list[float],
@@ -395,6 +395,7 @@ async def benchmark(
print(f"Traffic request rate: {request_rate}")
print(f"Burstiness factor: {burstiness} ({distribution})")
print(f"Maximum request concurrency: {max_concurrency}")
print(f"Drop ratio: {args.drop_ratio}")

pbar = None if disable_tqdm else tqdm(total=len(input_requests))

@@ -443,6 +444,8 @@ async def limited_request_func(request_func_input, pbar):
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

outputs.sort(key=lambda x: x.end_timestamp)

if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
@@ -460,11 +463,30 @@ async def limited_request_func(request_func_input, pbar):
if pbar is not None:
pbar.close()

benchmark_duration = time.perf_counter() - benchmark_start_time
print("benchmark_duration:", benchmark_duration)
drop_ratio = args.drop_ratio
if 0.0 < drop_ratio < 1:
# Per drop_ratio, drop half of the requests from the head and half from the tail; they are excluded from the benchmark statistics
n = len(outputs)
drop_count = int(n * drop_ratio)
half = drop_count // 2
if half > 0:
outputs = outputs[half : n - half]

# Compute the total duration from the timestamps at which the last chunks were received
if len(outputs) >= 2:
benchmark_duration = outputs[-1].end_timestamp - outputs[0].end_timestamp
else:
benchmark_duration = 0.0

print(f"丢弃前数量: {n}")
print(f"丢弃后数量: {len(outputs)}")
print(f"benchmark_duration: {benchmark_duration} 秒")
else:
benchmark_duration = time.perf_counter() - benchmark_start_time
print(f"benchmark_duration: {benchmark_duration} 秒")

metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests,
# input_requests=input_requests,
outputs=outputs,
dur_s=benchmark_duration,
# tokenizer=tokenizer,
@@ -1081,6 +1103,12 @@ def main(args: argparse.Namespace):
action="store_true",
help="shuffle dataset",
)
parser.add_argument(
"--drop-ratio",
type=float,
default=0.0,
help="Drop ratio of the outputs. [0, 1)",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
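Taken together, the benchmark_serving.py changes sort completed requests by end_timestamp, trim a symmetric head and tail according to --drop-ratio, and measure throughput over the window between the first and last surviving completions. A minimal sketch of that logic with made-up numbers (Output and trim_and_measure are illustrative names, not the PR's):

from dataclasses import dataclass

@dataclass
class Output:
    end_timestamp: float

def trim_and_measure(outputs: list[Output], drop_ratio: float) -> tuple[list[Output], float]:
    # Mirrors the PR: sort by completion time, drop half of the
    # requests from each end, then measure the remaining window.
    outputs = sorted(outputs, key=lambda o: o.end_timestamp)
    n = len(outputs)
    if 0.0 < drop_ratio < 1.0:
        half = int(n * drop_ratio) // 2
        if half > 0:
            outputs = outputs[half : n - half]
    duration = outputs[-1].end_timestamp - outputs[0].end_timestamp if len(outputs) >= 2 else 0.0
    return outputs, duration

# Ten requests finishing one second apart with --drop-ratio 0.2:
# one request is dropped from each end, and the measured window runs
# from timestamp 1.0 to 8.0, i.e. 7.0 seconds.
outs = [Output(end_timestamp=float(i)) for i in range(10)]
kept, duration = trim_and_measure(outs, 0.2)
assert len(kept) == 8 and duration == 7.0

The effect is that warm-up and drain requests at the edges of the run no longer dilute the steady-state throughput numbers, at the cost of a shorter measurement window.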