 import sys
 import datasets
 import pytest
+import transformers
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
 from tqdm import tqdm
-
+from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationConfig, CacheEvictionConfig, AggregationMode
 
 from utils.ov_genai_pipelines import PipelineType, generate_and_compare
 from utils.longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred
+from utils.milebench import MileBenchDataset, Eval
 from utils.constants import get_default_llm_properties
-from utils.hugging_face import download_and_convert_model
+from utils.hugging_face import download_and_convert_model, _download_and_convert_model
+from utils.network import retry_request
 from data.test_dataset import get_test_dataset
 
 
@@ -182,7 +185,7 @@ def test_dynamic_memory_allocation(params):
 
 
 @dataclass
-class LongBenchTestData:
+class BenchmarkTestData:
     subset: str
     threshold: float
     max_cache_usage_optimization_ratio: float
@@ -192,9 +195,9 @@ class LongBenchTestData:
 @pytest.mark.nightly
 @pytest.mark.parametrize("device", ["CPU", "GPU"])
 @pytest.mark.parametrize("test_struct", [
-    LongBenchTestData("samsum", 4, 1.6, 3.3),
-    LongBenchTestData("trec", 3.2, 2.0, 3.3),
-    LongBenchTestData("qasper", 5.8, 1.7, 3.6),
+    BenchmarkTestData("samsum", 4, 1.6, 3.3),
+    BenchmarkTestData("trec", 3.2, 2.0, 3.3),
+    BenchmarkTestData("qasper", 5.8, 1.7, 3.6),
 ])
 def test_optimized_generation_longbench(device, test_struct):
     seqs_per_request = 32
@@ -265,3 +268,102 @@ def test_optimized_generation_longbench(device, test_struct):
     assert ref_score - score <= test_struct.threshold
     assert max_optimization_ratio >= test_struct.max_cache_usage_optimization_ratio
     assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio
+
+
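+# Eviction keeps the first 32 tokens and the 128 most recent tokens of each sequence,
+# capping the per-sequence KV cache at 672 tokens; token importance is aggregated by SUM.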
+MILEBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=128, max_cache_size=672, aggregation_mode=AggregationMode.SUM)
+
+@pytest.mark.nightly
+@pytest.mark.parametrize("device", ["CPU", "GPU"])
+@pytest.mark.parametrize("test_struct", [
+    BenchmarkTestData("ALFRED", 3.2, 2.0, 3.3),
+    BenchmarkTestData("MMCoQA", 4, 1.6, 3.3),
+    BenchmarkTestData("TextNeedleInAHaystack", 3.2, 2.0, 3.3),
+    BenchmarkTestData("WikiVQA", 5.8, 1.29, 2.621),
+])
+def test_optimized_generation_milebench(device, test_struct):
+    seqs_per_request = 32
+    num_kv_blocks = 1000 if device == "CPU" else 500
+    model_id = "Qwen/Qwen2-VL-2B-Instruct"
+    _, _, models_path = _download_and_convert_model(model_id, OVModelForVisualCausalLM)
+    scheduler_config = get_scheduler_config(num_kv_blocks)
+
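+    # Same scheduler settings as the baseline, but with cache eviction switched on.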
+    scheduler_config_opt = get_scheduler_config(num_kv_blocks)
+    scheduler_config_opt.use_cache_eviction = True
+    scheduler_config_opt.cache_eviction_config = MILEBENCH_CACHE_EVICTION_CONFIG
+
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())
+
+    generation_config = GenerationConfig()  # expecting default greedy sampling
+    generation_config.num_return_sequences = 1
+    generation_config.max_new_tokens = 64
+
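+    # The HF processor is only used to render the chat template into a prompt string;
+    # generation itself runs through the ContinuousBatchingPipeline.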
+    processor = retry_request(
+        lambda: transformers.AutoProcessor.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+        )
+    )
+
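+    # Load the requested MileBench subset, limited to a single batch of samples.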
+    data_dir = "milebench_data"  # HF_HOME / "milebench_data"
+    subset = test_struct.subset
+    data = MileBenchDataset(
+        data_dir=data_dir,
+        subset=subset,
+        subset_size=seqs_per_request,
+    )
+    with tqdm(total=len(data)) as progress_bar:
+        prompts, images = [], []
+        answers = []
+        ref_answers = []
+        for p_idx, data_sample in enumerate(data):
+            conversation = data_sample["conversation"]
+            prompt = processor.apply_chat_template(
+                conversation, tokenize=False, add_generation_prompt=True
+            )
+            image = data_sample["images"]
+
+            progress_bar.update(1)
+            prompts.append(prompt)
+            images.append(image)
+            answers.append({"gt_answer": data_sample["gt_answer"], "choice_list": data_sample["choice_list"]})
+            ref_answers.append({"gt_answer": data_sample["gt_answer"], "choice_list": data_sample["choice_list"]})
+
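+            # Once a full batch has accumulated (or the dataset is exhausted),
+            # run it through both pipelines and record the predictions.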
+            if len(prompts) == seqs_per_request or p_idx == len(data) - 1:
+                ans_batch = model_cb_opt.generate(
+                    prompts, images=images, generation_config=[generation_config] * len(prompts)
+                )
+                ref_ans_batch = model_cb_noopt.generate(
+                    prompts, images=images, generation_config=[generation_config] * len(prompts)
+                )
+                for i, (opt_output, ref_output) in enumerate(zip(ans_batch, ref_ans_batch), start=p_idx - len(prompts) + 1):
+                    answers[i]["pred"] = opt_output.m_generation_ids[0]
+                    ref_answers[i]["pred"] = ref_output.m_generation_ids[0]
+                prompts.clear()
+                images.clear()
+
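+    # Score both runs with the MileBench evaluator; the optimized pipeline may lose
+    # at most `threshold` points relative to the baseline.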
+    question_type = data.annotation['meta_data']['question_type']
+    scorer = Eval()
+    score = scorer.evaluate(answers, subset, question_type)
+    print(f"Score: {score}")
+
+    ref_score = scorer.evaluate(ref_answers, subset, question_type)
+    print(f"Reference score: {ref_score}")
+    pipeline_opt_metrics = model_cb_opt.get_metrics()
+    pipeline_noopt_metrics = model_cb_noopt.get_metrics()
+
+    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+    print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
+    max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
+    avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
+    print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
+
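+    # Explicitly release both pipelines (and their KV caches) before the assertions.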
+    del model_cb_opt
+    del model_cb_noopt
+    import gc
+    gc.collect()
+
+    assert ref_score - score <= test_struct.threshold
+    assert max_optimization_ratio >= test_struct.max_cache_usage_optimization_ratio
+    assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio