@@ -270,15 +270,14 @@ def test_optimized_generation_longbench(device, test_struct):
     assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio
 
 
-MILEBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=128, max_cache_size=672, aggregation_mode=AggregationMode.SUM)
+MILEBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=64, max_cache_size=352, aggregation_mode=AggregationMode.SUM)
 
 @pytest.mark.nightly
 @pytest.mark.parametrize("device", ["CPU", "GPU"])
 @pytest.mark.parametrize("test_struct", [
-    BenchmarkTestData("ALFRED", 3.2, 2.0, 3.3),
-    BenchmarkTestData("MMCoQA", 4, 1.6, 3.3),
-    BenchmarkTestData("TextNeedleInAHaystack", 3.2, 2.0, 3.3),
-    BenchmarkTestData("WikiVQA", 5.8, 1.29, 2.621),
+    BenchmarkTestData("ALFRED", 0.011, 1.440, 1.574),
+    BenchmarkTestData("MMCoQA", 0.032, 1.843, 1.620),
+    BenchmarkTestData("WikiVQA", 0.032, 1.412, 1.527),
 ])
 def test_optimized_generation_milebench(device, test_struct):
     seqs_per_request = 32
@@ -292,19 +291,13 @@ def test_optimized_generation_milebench(device, test_struct):
     if scheduler_config_opt.use_cache_eviction:
         scheduler_config_opt.cache_eviction_config = MILEBENCH_CACHE_EVICTION_CONFIG
 
-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, properties=get_default_llm_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, properties=get_default_llm_properties())
 
     generation_config = GenerationConfig()  # expecting default greedy sampling
     generation_config.num_return_sequences = 1
-    generation_config.max_new_tokens = 64
-
-    processor = retry_request(
-        lambda: transformers.AutoProcessor.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-        )
-    )
+    generation_config.max_new_tokens = 512
+    generation_config.do_sample = False
 
     data_dir = "milebench_data"  # HF_HOME / "milebench_data"
     subset = test_struct.subset
@@ -313,15 +306,13 @@ def test_optimized_generation_milebench(device, test_struct):
         subset=subset,
         subset_size=seqs_per_request,
     )
+
     with tqdm(total=len(data)) as progress_bar:
         prompts, images = [], []
         answers = []
         ref_answers = []
         for p_idx, data_sample in enumerate(data):
-            conversation = data_sample["conversation"]
-            prompt = processor.apply_chat_template(
-                conversation, tokenize=False, add_generation_prompt=True
-            )
+            prompt = data_sample["prompt"]
             image = data_sample["images"]
 
             progress_bar.update(1)
@@ -332,24 +323,27 @@ def test_optimized_generation_milebench(device, test_struct):
 
             if len(prompts) == seqs_per_request or p_idx == len(data) - 1:
                 ans_batch = model_cb_opt.generate(
-                    prompts, images=images, generation_config=[generation_config] * len(prompts)
+                    prompts, images=images, generation_config=[generation_config] * len(prompts),
                 )
                 ref_ans_batch = model_cb_noopt.generate(
-                    prompts, images=images, generation_config=[generation_config] * len(prompts)
+                    prompts, images=images, generation_config=[generation_config] * len(prompts),
                 )
+
                 for i, (opt_output, ref_output) in enumerate(zip(ans_batch, ref_ans_batch), start=p_idx - len(prompts) + 1):
-                    answers[i]["pred"] = opt_output.m_generation_ids[0]
-                    ref_answers[i]["pred"] = ref_output.m_generation_ids[0]
+                    answers[i]["pred"] = opt_output.texts[0]
+                    ref_answers[i]["pred"] = ref_output.texts[0]
                 prompts.clear()
                 images.clear()
 
     question_type = data.annotation['meta_data']['question_type']
     scorer = Eval()
+
     score = scorer.evaluate(answers, subset, question_type)
     print(f"Score: {score}")
 
     ref_score = scorer.evaluate(ref_answers, subset, question_type)
     print(f"Reference score: {ref_score}")
+
     pipeline_opt_metrics = model_cb_opt.get_metrics()
     pipeline_noopt_metrics = model_cb_noopt.get_metrics()
 