@@ -268,36 +268,41 @@ def test_optimized_generation_longbench(test_struct):
268268 assert avg_optimization_ratio >= test_struct .avg_cache_usage_optimization_ratio
269269
270270
271- MILEBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig (start_size = 32 , recent_size = 64 , max_cache_size = 352 , aggregation_mode = AggregationMode .SUM )
272-
273- @pytest .mark .nightly
274- @pytest .mark .parametrize ("device" , ["CPU" , "GPU" ])
271+ @pytest .mark .precommit
275272@pytest .mark .parametrize (
276273 ("test_struct" , "download_test_content" ), [
277- (BenchmarkTestData ("ALFRED" , 0.011 , 1.440 , 1.574 ), "MileBench_part0.tar.gz" ),
278- (BenchmarkTestData ("MMCoQA" , 0.032 , 1.843 , 1.620 ), "MileBench_part2.tar.gz" ),
279- (BenchmarkTestData ("WikiVQA" , 0.032 , 1.412 , 1.527 ), "MileBench_part5.tar.gz" ),
274+ (BenchmarkTestData ("ALFRED" , 0.006 , 2.10 , 2.33 ), "MileBench_part0.tar.gz" ),
275+ (BenchmarkTestData ("MMCoQA" , 0.001 , 1.91 , 1.73 ), "MileBench_part2.tar.gz" ),
276+ (BenchmarkTestData ("WikiVQA" , 0.001 , 1.41 , 1.47 ), "MileBench_part5.tar.gz" ),
280277 ],
281278 indirect = ["download_test_content" ]
282279)
283- def test_optimized_generation_milebench (device , test_struct , download_test_content ):
284- seqs_per_request = 32
285- num_kv_blocks = 1000 if device == "CPU" else 500
280+ def test_optimized_generation_milebench (test_struct , download_test_content ):
281+ seqs_per_request = 16
282+ device = "CPU"
283+ num_kv_blocks = 500
286284 model_id = "Qwen/Qwen2-VL-2B-Instruct"
287285 _ , _ , models_path = _download_and_convert_model (model_id , OVModelForVisualCausalLM )
288286 scheduler_config = get_scheduler_config (num_kv_blocks )
289287
290288 scheduler_config_opt = get_scheduler_config (num_kv_blocks )
291289 scheduler_config_opt .use_cache_eviction = True
292290 if scheduler_config_opt .use_cache_eviction :
293- scheduler_config_opt .cache_eviction_config = MILEBENCH_CACHE_EVICTION_CONFIG
291+ eviction_config = CacheEvictionConfig (
292+ start_size = 32 ,
293+ recent_size = 64 ,
294+ max_cache_size = 224 ,
295+ aggregation_mode = AggregationMode .SUM ,
296+ snapkv_window_size = 8 ,
297+ )
298+ scheduler_config_opt .cache_eviction_config = eviction_config
294299
295300 model_cb_noopt = ContinuousBatchingPipeline (models_path , scheduler_config , device , properties = get_default_llm_properties ())
296301 model_cb_opt = ContinuousBatchingPipeline (models_path , scheduler_config_opt , device , properties = get_default_llm_properties ())
297302
298303 generation_config = GenerationConfig () # expecting default greedy sampling
299304 generation_config .num_return_sequences = 1
300- generation_config .max_new_tokens = 512
305+ generation_config .max_new_tokens = 64 # change to 512 for full evaluation
301306 generation_config .do_sample = False
302307
303308 subset = test_struct .subset
0 commit comments