
Commit 9d22e73

Add MileBench validation for VLMs
1 parent 0e3678c commit 9d22e73

File tree

2 files changed: +390 −6 lines changed


tests/python_tests/test_kv_cache_eviction.py

Lines changed: 108 additions & 6 deletions
@@ -4,17 +4,20 @@
 import sys
 import datasets
 import pytest
+import transformers
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
 from tqdm import tqdm
-
+from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationConfig, CacheEvictionConfig, AggregationMode
 
 from utils.ov_genai_pipelines import PipelineType, generate_and_compare
 from utils.longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred
+from utils.milebench import MileBenchDataset, Eval
 from utils.constants import get_default_llm_properties
-from utils.hugging_face import download_and_convert_model
+from utils.hugging_face import download_and_convert_model, _download_and_convert_model
+from utils.network import retry_request
 from data.test_dataset import get_test_dataset
 

@@ -182,7 +185,7 @@ def test_dynamic_memory_allocation(params):
 
 
 @dataclass
-class LongBenchTestData:
+class BenchmarkTestData:
     subset: str
     threshold: float
     max_cache_usage_optimization_ratio: float
@@ -192,9 +195,9 @@ class LongBenchTestData:
 @pytest.mark.nightly
 @pytest.mark.parametrize("device", ["CPU", "GPU"])
 @pytest.mark.parametrize("test_struct", [
-    LongBenchTestData("samsum", 4, 1.6, 3.3),
-    LongBenchTestData("trec", 3.2, 2.0, 3.3),
-    LongBenchTestData("qasper", 5.8, 1.7, 3.6),
+    BenchmarkTestData("samsum", 4, 1.6, 3.3),
+    BenchmarkTestData("trec", 3.2, 2.0, 3.3),
+    BenchmarkTestData("qasper", 5.8, 1.7, 3.6),
 ])
 def test_optimized_generation_longbench(device, test_struct):
     seqs_per_request = 32
@@ -265,3 +268,102 @@ def test_optimized_generation_longbench(device, test_struct):
     assert ref_score - score <= test_struct.threshold
     assert max_optimization_ratio >= test_struct.max_cache_usage_optimization_ratio
     assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio
+
+
+MILEBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=128, max_cache_size=672, aggregation_mode=AggregationMode.SUM)
+
+@pytest.mark.nightly
+@pytest.mark.parametrize("device", ["CPU", "GPU"])
+@pytest.mark.parametrize("test_struct", [
+    BenchmarkTestData("ALFRED", 3.2, 2.0, 3.3),
+    BenchmarkTestData("MMCoQA", 4, 1.6, 3.3),
+    BenchmarkTestData("TextNeedleInAHaystack", 3.2, 2.0, 3.3),
+    BenchmarkTestData("WikiVQA", 5.8, 1.29, 2.621),
+])
+def test_optimized_generation_milebench(device, test_struct):
+    seqs_per_request = 32
+    num_kv_blocks = 1000 if device == "CPU" else 500
+    model_id = "Qwen/Qwen2-VL-2B-Instruct"
+    _, _, models_path = _download_and_convert_model(model_id, OVModelForVisualCausalLM)
+    scheduler_config = get_scheduler_config(num_kv_blocks)
+
+    scheduler_config_opt = get_scheduler_config(num_kv_blocks)
+    scheduler_config_opt.use_cache_eviction = True
+    if scheduler_config_opt.use_cache_eviction:
+        scheduler_config_opt.cache_eviction_config = MILEBENCH_CACHE_EVICTION_CONFIG
+
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())
+
+    generation_config = GenerationConfig() # expecting default greedy sampling
+    generation_config.num_return_sequences = 1
+    generation_config.max_new_tokens = 64
+
+    processor = retry_request(
+        lambda: transformers.AutoProcessor.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+        )
+    )
+
+    data_dir = "milebench_data" # HF_HOME / "milebench_data"
+    subset = test_struct.subset
+    data = MileBenchDataset(
+        data_dir=data_dir,
+        subset=subset,
+        subset_size=seqs_per_request,
+    )
+    with tqdm(total=len(data)) as progress_bar:
+        prompts, images = [], []
+        answers = []
+        ref_answers = []
+        for p_idx, data_sample in enumerate(data):
+            conversation = data_sample["conversation"]
+            prompt = processor.apply_chat_template(
+                conversation, tokenize=False, add_generation_prompt=True
+            )
+            image = data_sample["images"]
+
+            progress_bar.update(1)
+            prompts.append(prompt)
+            images.append(image)
+            answers.append({"gt_answer": data_sample["gt_answer"], "choice_list": data_sample["choice_list"]})
+            ref_answers.append({"gt_answer": data_sample["gt_answer"], "choice_list": data_sample["choice_list"]})
+
+            if len(prompts) == seqs_per_request or p_idx == len(data) - 1:
+                ans_batch = model_cb_opt.generate(
+                    prompts, images=images, generation_config=[generation_config] * len(prompts)
+                )
+                ref_ans_batch = model_cb_noopt.generate(
+                    prompts, images=images, generation_config=[generation_config] * len(prompts)
+                )
+                for i, (opt_output, ref_output) in enumerate(zip(ans_batch, ref_ans_batch), start=p_idx - len(prompts) + 1):
+                    answers[i]["pred"] = opt_output.m_generation_ids[0]
+                    ref_answers[i]["pred"] = ref_output.m_generation_ids[0]
+                prompts.clear()
+                images.clear()
+
+    question_type = data.annotation['meta_data']['question_type']
+    scorer = Eval()
+    score = scorer.evaluate(answers, subset, question_type)
+    print(f"Score: {score}")
+
+    ref_score = scorer.evaluate(ref_answers, subset, question_type)
+    print(f"Reference score: {ref_score}")
+    pipeline_opt_metrics = model_cb_opt.get_metrics()
+    pipeline_noopt_metrics = model_cb_noopt.get_metrics()
+
+    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+    print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
+    max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
+    avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
+    print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
+
+    del model_cb_opt
+    del model_cb_noopt
+    import gc
+    gc.collect()
+
+    assert ref_score - score <= test_struct.threshold
+    assert max_optimization_ratio >= test_struct.max_cache_usage_optimization_ratio
+    assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio
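
For context, here is a minimal sketch (not part of the commit) of the arithmetic behind MILEBENCH_CACHE_EVICTION_CONFIG and of the acceptance checks at the end of the new test. The helper function and the assumption that the three sizes are token counts are illustrative only; the values and the assertion logic are taken from the diff above.

    # Sizes from MILEBENCH_CACHE_EVICTION_CONFIG in the diff above.
    start_size, recent_size, max_cache_size = 32, 128, 672

    # Assuming these are per-sequence token counts, the middle region that
    # remains eligible for eviction between the protected "start" and
    # "recent" areas would be:
    evictable_tokens = max_cache_size - start_size - recent_size  # 672 - 32 - 128 = 512

    def check_thresholds(score, ref_score, max_ratio, avg_ratio, test_struct):
        # Hypothetical helper mirroring the assertions in
        # test_optimized_generation_milebench: accuracy may drop by at most
        # `threshold`, while cache usage must improve by at least the given
        # max/avg ratios.
        assert ref_score - score <= test_struct.threshold
        assert max_ratio >= test_struct.max_cache_usage_optimization_ratio
        assert avg_ratio >= test_struct.avg_cache_usage_optimization_ratio

Assuming the usual pytest conventions, the new test can be selected on its own with something like: pytest tests/python_tests/test_kv_cache_eviction.py -k milebench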
