 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional
+from typing import Any, Literal, Optional

 import numpy as np
 from tqdm.asyncio import tqdm
@@ -107,14 +107,42 @@ class BenchmarkMetrics:
     percentiles_e2el_ms: list[tuple[float, float]]


+def _get_current_request_rate(
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
+    ramp_up_start_rps: Optional[int],
+    ramp_up_end_rps: Optional[int],
+    request_index: int,
+    total_requests: int,
+    request_rate: float,
+) -> float:
+    if (
+        ramp_up_strategy
+        and ramp_up_start_rps is not None
+        and ramp_up_end_rps is not None
+    ):
+        progress = request_index / max(total_requests - 1, 1)
+        if ramp_up_strategy == "linear":
+            increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
+            return ramp_up_start_rps + increase
+        elif ramp_up_strategy == "exponential":
+            ratio = ramp_up_end_rps / ramp_up_start_rps
+            return ramp_up_start_rps * (ratio**progress)
+        else:
+            raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
+    return request_rate
+
+
 async def get_request(
     input_requests: list[SampleRequest],
     request_rate: float,
     burstiness: float = 1.0,
-) -> AsyncGenerator[SampleRequest, None]:
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
+) -> AsyncGenerator[tuple[SampleRequest, float], None]:
     """
     Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness.
+    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.

     Args:
         input_requests:
@@ -129,22 +157,44 @@ async def get_request(
             A lower burstiness value (0 < burstiness < 1) results
             in more bursty requests, while a higher burstiness value
             (burstiness > 1) results in a more uniform arrival of requests.
+        ramp_up_strategy (optional):
+            The ramp-up strategy. Can be "linear" or "exponential".
+            If None, uses constant request rate (specified by request_rate).
+        ramp_up_start_rps (optional):
+            The starting request rate for ramp-up.
+        ramp_up_end_rps (optional):
+            The ending request rate for ramp-up.
     """
-    input_requests: Iterable[SampleRequest] = iter(input_requests)
-
-    # Calculate scale parameter theta to maintain the desired request_rate.
     assert burstiness > 0, (
         f"A positive burstiness factor is expected, but given {burstiness}."
     )
-    theta = 1.0 / (request_rate * burstiness)
+    # Convert to list to get length for ramp-up calculations
+    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
+        input_requests = list(input_requests)
+
+    total_requests = len(input_requests)
+    request_index = 0

     for request in input_requests:
-        yield request
+        current_request_rate = _get_current_request_rate(
+            ramp_up_strategy,
+            ramp_up_start_rps,
+            ramp_up_end_rps,
+            request_index,
+            total_requests,
+            request_rate,
+        )
+
+        yield request, current_request_rate

-        if request_rate == float("inf"):
+        request_index += 1
+
+        if current_request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue

+        theta = 1.0 / (current_request_rate * burstiness)
+
         # Sample the request interval from the gamma distribution.
         # If burstiness is 1, it follows exponential distribution.
         interval = np.random.gamma(shape=burstiness, scale=theta)
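
For reference, the per-request rate interpolates between the start and end RPS as the request index advances. The following is a minimal standalone sketch mirroring the math in _get_current_request_rate above (sketch_rate is an illustrative name, not part of the patch):

def sketch_rate(strategy: str, start: float, end: float, i: int, n: int) -> float:
    # progress runs from 0.0 (first request) to 1.0 (last request)
    progress = i / max(n - 1, 1)
    if strategy == "linear":
        return start + (end - start) * progress
    # "exponential": geometric interpolation from start to end
    return start * (end / start) ** progress


for strategy in ("linear", "exponential"):
    print(strategy, [round(sketch_rate(strategy, 1, 16, i, 5), 2) for i in range(5)])
# linear [1.0, 4.75, 8.5, 12.25, 16.0]
# exponential [1.0, 2.0, 4.0, 8.0, 16.0]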
@@ -290,6 +340,9 @@ async def benchmark(
     max_concurrency: Optional[int],
     lora_modules: Optional[Iterable[str]],
     extra_body: Optional[dict],
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -353,7 +406,15 @@ async def benchmark(

     distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"

-    print(f"Traffic request rate: {request_rate}")
+    if ramp_up_strategy is not None:
+        print(
+            f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
+            f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
+            "the duration of the benchmark."
+        )
+    else:
+        print(f"Traffic request rate: {request_rate} RPS.")
+
     print(f"Burstiness factor: {burstiness} ({distribution})")
     print(f"Maximum request concurrency: {max_concurrency}")

@@ -373,7 +434,34 @@ async def limited_request_func(request_func_input, pbar):

     benchmark_start_time = time.perf_counter()
     tasks: list[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate, burstiness):
+
+    rps_change_events = []
+    last_int_rps = -1
+    if ramp_up_strategy is not None and ramp_up_start_rps is not None:
+        last_int_rps = ramp_up_start_rps
+        rps_change_events.append(
+            {
+                "rps": last_int_rps,
+                "timestamp": datetime.now().isoformat(),
+            }
+        )
+
+    async for request, current_request_rate in get_request(
+        input_requests,
+        request_rate,
+        burstiness,
+        ramp_up_strategy,
+        ramp_up_start_rps,
+        ramp_up_end_rps,
+    ):
+        if ramp_up_strategy is not None:
+            current_int_rps = int(current_request_rate)
+            if current_int_rps > last_int_rps:
+                timestamp = datetime.now().isoformat()
+                for rps_val in range(last_int_rps + 1, current_int_rps + 1):
+                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
+                last_int_rps = current_int_rps
+
         prompt, prompt_len, output_len, mm_content = (
             request.prompt,
             request.prompt_len,
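
Whenever the ramping rate crosses a new whole-RPS level, the loop above appends a timestamped event so the result JSON can later correlate metrics with the instantaneous target rate. A rough standalone sketch of that bookkeeping with illustrative rate values (not part of the patch):

from datetime import datetime

rps_change_events = [{"rps": 1, "timestamp": datetime.now().isoformat()}]
last_int_rps = 1

for current_request_rate in (1.4, 2.1, 3.9):  # e.g. values from a linear ramp-up
    current_int_rps = int(current_request_rate)
    if current_int_rps > last_int_rps:
        timestamp = datetime.now().isoformat()
        for rps_val in range(last_int_rps + 1, current_int_rps + 1):
            rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
        last_int_rps = current_int_rps

print([event["rps"] for event in rps_change_events])  # [1, 2, 3]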
@@ -397,11 +485,8 @@ async def limited_request_func(request_func_input, pbar):
             ignore_eos=ignore_eos,
             extra_body=extra_body,
         )
-        tasks.append(
-            asyncio.create_task(
-                limited_request_func(request_func_input=request_func_input, pbar=pbar)
-            )
-        )
+        task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
+        tasks.append(asyncio.create_task(task))
     outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

     if profile:
@@ -477,6 +562,9 @@ async def limited_request_func(request_func_input, pbar):
         "errors": [output.error for output in outputs],
     }

+    if rps_change_events:
+        result["rps_change_events"] = rps_change_events
+

     def process_one_metric(
         # E.g., "ttft"
@@ -610,6 +698,26 @@ def main(args: argparse.Namespace):
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer_mode = args.tokenizer_mode

+    # Validate ramp-up arguments
+    if args.ramp_up_strategy is not None:
+        if args.request_rate != float("inf"):
+            raise ValueError(
+                "When using ramp-up, do not specify --request-rate. "
+                "The request rate will be controlled by the ramp-up parameters. "
+                "Please remove the --request-rate argument."
+            )
+        if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
+            raise ValueError(
+                "When using --ramp-up-strategy, both --ramp-up-start-rps and "
+                "--ramp-up-end-rps must be specified."
+            )
+        if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
+            raise ValueError("Ramp-up start and end RPS must be non-negative.")
+        if args.ramp_up_start_rps > args.ramp_up_end_rps:
+            raise ValueError("Ramp-up start RPS must not exceed end RPS.")
+        if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
+            raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
+
     if args.base_url is not None:
         api_url = f"{args.base_url}{args.endpoint}"
         base_url = f"{args.base_url}"
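
The ramp-up flags are validated together before the benchmark starts: ramp-up replaces --request-rate, both bounds are required, and an exponential ramp cannot start at 0. A hedged standalone sketch of how those checks combine (validate_ramp_up is a hypothetical helper written only for this illustration):

import argparse


def validate_ramp_up(args: argparse.Namespace) -> None:
    # Mirrors the checks in main() above, in simplified form.
    if args.ramp_up_strategy is None:
        return
    if args.request_rate != float("inf"):
        raise ValueError("do not combine --request-rate with ramp-up")
    if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
        raise ValueError("both --ramp-up-start-rps and --ramp-up-end-rps are required")
    if args.ramp_up_start_rps > args.ramp_up_end_rps:
        raise ValueError("start RPS must not exceed end RPS")


# OK: ramp from 1 to 10 RPS; --request-rate is left at its default (inf).
validate_ramp_up(
    argparse.Namespace(
        ramp_up_strategy="linear",
        ramp_up_start_rps=1,
        ramp_up_end_rps=10,
        request_rate=float("inf"),
    )
)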
@@ -802,6 +910,9 @@ def main(args: argparse.Namespace):
             max_concurrency=args.max_concurrency,
             lora_modules=args.lora_modules,
             extra_body=sampling_params,
+            ramp_up_strategy=args.ramp_up_strategy,
+            ramp_up_start_rps=args.ramp_up_start_rps,
+            ramp_up_end_rps=args.ramp_up_end_rps,
         )
     )

@@ -834,6 +945,11 @@ def main(args: argparse.Namespace):
         result_json["burstiness"] = args.burstiness
         result_json["max_concurrency"] = args.max_concurrency

+        if args.ramp_up_strategy is not None:
+            result_json["ramp_up_strategy"] = args.ramp_up_strategy
+            result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
+            result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
+
         # Merge with benchmark result
         result_json = {**result_json, **benchmark_result}

@@ -859,7 +975,10 @@ def main(args: argparse.Namespace):
             if args.max_concurrency is not None
             else ""
         )
-        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        if args.ramp_up_strategy is not None:
+            file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        else:
+            file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
         if args.result_filename:
             file_name = args.result_filename
         if args.result_dir:
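
With ramp-up enabled, the result file name encodes the strategy and the RPS range instead of a single QPS value. An illustrative sketch of the resulting name (backend, concurrency, model id, and timestamp values are assumed):

backend = "openai-chat"
ramp_up_strategy = "linear"
ramp_up_start_rps, ramp_up_end_rps = 1, 10
max_concurrency_str = "-concurrency64"  # assumed; empty when --max-concurrency is unset
base_model_id = "Llama-3.1-8B-Instruct"
current_dt = "20250101-120000"

file_name = (
    f"{backend}-ramp-up-{ramp_up_strategy}-{ramp_up_start_rps}qps-"
    f"{ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
)
print(file_name)
# openai-chat-ramp-up-linear-1qps-10qps-concurrency64-Llama-3.1-8B-Instruct-20250101-120000.json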
@@ -1225,6 +1344,31 @@ def create_argument_parser():
         "script chooses a LoRA module at random.",
     )

+    parser.add_argument(
+        "--ramp-up-strategy",
+        type=str,
+        default=None,
+        choices=["linear", "exponential"],
+        help="The ramp-up strategy. This is used to "
+        "ramp up the request rate from an initial RPS to a final "
+        "RPS (specified by --ramp-up-start-rps and --ramp-up-end-rps) "
+        "over the duration of the benchmark.",
+    )
+    parser.add_argument(
+        "--ramp-up-start-rps",
+        type=int,
+        default=None,
+        help="The starting request rate for ramp-up (RPS). "
+        "Must be specified when --ramp-up-strategy is used.",
+    )
+    parser.add_argument(
+        "--ramp-up-end-rps",
+        type=int,
+        default=None,
+        help="The ending request rate for ramp-up (RPS). "
+        "Must be specified when --ramp-up-strategy is used.",
+    )
+
     return parser

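
For completeness, the following standalone sketch shows how the three new flags parse, using a minimal parser with only these arguments rather than the full create_argument_parser:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--ramp-up-strategy", type=str, default=None, choices=["linear", "exponential"]
)
parser.add_argument("--ramp-up-start-rps", type=int, default=None)
parser.add_argument("--ramp-up-end-rps", type=int, default=None)

args = parser.parse_args(
    ["--ramp-up-strategy", "exponential", "--ramp-up-start-rps", "1", "--ramp-up-end-rps", "32"]
)
print(args.ramp_up_strategy, args.ramp_up_start_rps, args.ramp_up_end_rps)
# exponential 1 32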
12301374