
Commit db5b244 (parent: 9d22e73)

Fix accuracy issue

2 files changed: +64 -35 lines

tests/python_tests/test_kv_cache_eviction.py
Lines changed: 17 additions & 23 deletions
@@ -270,15 +270,14 @@ def test_optimized_generation_longbench(device, test_struct):
     assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio


-MILEBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=128, max_cache_size=672, aggregation_mode=AggregationMode.SUM)
+MILEBENCH_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=64, max_cache_size=352, aggregation_mode=AggregationMode.SUM)

 @pytest.mark.nightly
 @pytest.mark.parametrize("device", ["CPU", "GPU"])
 @pytest.mark.parametrize("test_struct", [
-    BenchmarkTestData("ALFRED", 3.2, 2.0, 3.3),
-    BenchmarkTestData("MMCoQA", 4, 1.6, 3.3),
-    BenchmarkTestData("TextNeedleInAHaystack", 3.2, 2.0, 3.3),
-    BenchmarkTestData("WikiVQA", 5.8, 1.29, 2.621),
+    BenchmarkTestData("ALFRED", 0.011, 1.440, 1.574),
+    BenchmarkTestData("MMCoQA", 0.032, 1.843, 1.620),
+    BenchmarkTestData("WikiVQA", 0.032, 1.412, 1.527),
 ])
 def test_optimized_generation_milebench(device, test_struct):
     seqs_per_request = 32
@@ -292,19 +291,13 @@ def test_optimized_generation_milebench(device, test_struct):
     if scheduler_config_opt.use_cache_eviction:
         scheduler_config_opt.cache_eviction_config = MILEBENCH_CACHE_EVICTION_CONFIG

-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, properties=get_default_llm_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, properties=get_default_llm_properties())

     generation_config = GenerationConfig() # expecting default greedy sampling
     generation_config.num_return_sequences = 1
-    generation_config.max_new_tokens = 64
-
-    processor = retry_request(
-        lambda: transformers.AutoProcessor.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-        )
-    )
+    generation_config.max_new_tokens = 512
+    generation_config.do_sample = False

     data_dir = "milebench_data" # HF_HOME / "milebench_data"
     subset = test_struct.subset
@@ -313,15 +306,13 @@ def test_optimized_generation_milebench(device, test_struct):
         subset=subset,
         subset_size=seqs_per_request,
     )
+
     with tqdm(total=len(data)) as progress_bar:
         prompts, images = [], []
         answers = []
         ref_answers = []
         for p_idx, data_sample in enumerate(data):
-            conversation = data_sample["conversation"]
-            prompt = processor.apply_chat_template(
-                conversation, tokenize=False, add_generation_prompt=True
-            )
+            prompt = data_sample["prompt"]
             image = data_sample["images"]

             progress_bar.update(1)
@@ -332,24 +323,27 @@ def test_optimized_generation_milebench(device, test_struct):

             if len(prompts) == seqs_per_request or p_idx == len(data) - 1:
                 ans_batch = model_cb_opt.generate(
-                    prompts, images=images, generation_config=[generation_config] * len(prompts)
+                    prompts, images=images, generation_config=[generation_config] * len(prompts),
                 )
                 ref_ans_batch = model_cb_noopt.generate(
-                    prompts, images=images, generation_config=[generation_config] * len(prompts)
+                    prompts, images=images, generation_config=[generation_config] * len(prompts),
                 )
+
                 for i, (opt_output, ref_output) in enumerate(zip(ans_batch, ref_ans_batch), start=p_idx-len(prompts)+1):
-                    answers[i]["pred"] = opt_output.m_generation_ids[0]
-                    ref_answers[i]["pred"] = ref_output.m_generation_ids[0]
+                    answers[i]["pred"] = opt_output.texts[0]
+                    ref_answers[i]["pred"] = ref_output.texts[0]
                 prompts.clear()
                 images.clear()

     question_type = data.annotation['meta_data']['question_type']
     scorer = Eval()
+
     score = scorer.evaluate(answers, subset, question_type)
     print(f"Score: {score}")

     ref_score = scorer.evaluate(ref_answers, subset, question_type)
     print(f"Reference score: {ref_score}")
+
     pipeline_opt_metrics = model_cb_opt.get_metrics()
     pipeline_noopt_metrics = model_cb_noopt.get_metrics()

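A note on the tightened MILEBENCH_CACHE_EVICTION_CONFIG: in the cache eviction scheme, max_cache_size covers a never-evicted start area, a never-evicted recent area, and the evictable region between them, so shrinking recent_size from 128 to 64 and max_cache_size from 672 to 352 halves the evictable budget per sequence. A minimal sketch of that arithmetic, assuming this three-region layout (the helper below is illustrative, not part of the openvino_genai API):

def evictable_budget(start_size: int, recent_size: int, max_cache_size: int) -> int:
    # Tokens in the start and recent areas are pinned; only the middle
    # region competes for the remaining token budget and can be evicted.
    assert max_cache_size > start_size + recent_size
    return max_cache_size - start_size - recent_size

# Old config: 672 - 32 - 128 = 512 evictable tokens.
# New config: 352 - 32 - 64  = 256 evictable tokens.
print(evictable_budget(32, 64, 352))  # 256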
tests/python_tests/utils/milebench.py
Lines changed: 47 additions & 12 deletions
@@ -6,6 +6,49 @@
 #
 # Licensed under the Apache License

+# To download the required subsets from the MileBench dataset, please run the following script:
+#
+#!/bin/bash
+# OUT_DIR="milebench_data"
+# KEEP_DIRS=("ALFRED" "MMCoQA" "WikiVQA")
+# BASE_URL="https://huggingface.co/datasets/FreedomIntelligence/MileBench/resolve/main"
+# # List of tar.gz parts to download
+# PARTS=(part0 part2 part5)
+# # Create output directory
+# mkdir -p "$OUT_DIR"
+# cd "$OUT_DIR" || exit 1
+# # Download and extract
+# for part in "${PARTS[@]}"; do
+#     FILENAME="MileBench_${part}.tar.gz"
+#     URL="${BASE_URL}/${FILENAME}"
+#     echo "Downloading $FILENAME..."
+#     curl -L -o "$FILENAME" "$URL"
+#     echo "Extracting $FILENAME..."
+#     tar -xzf "$FILENAME" || { echo "Failed to extract $FILENAME"; exit 1; }
+#     rm "$FILENAME"
+# done
+# # Remove unwanted folders
+# echo "Cleaning up..."
+# for dir in */ ; do
+#     dir=${dir%/}
+#     if [[ ! " ${KEEP_DIRS[@]} " =~ " ${dir} " ]]; then
+#         echo "Removing $dir"
+#         rm -rf "$dir"
+#     fi
+# done
+# echo "Removing combined_1_images folders and *-adv.json inside kept directories..."
+# for dir in "${KEEP_DIRS[@]}"; do
+#     TARGET="$dir/combined_1_images"
+#     if [ -d "$TARGET" ]; then
+#         rm -rf "$TARGET"
+#     fi
+#     ADV_FILE="$dir/${dir}-adv.json"
+#     if [ -f "$ADV_FILE" ]; then
+#         rm "$ADV_FILE"
+#     fi
+# done
+
+
 import os
 import json
 import re
@@ -67,25 +110,17 @@ def __getitem__(self, idx):
                 context += choice_str

         img_num = len(ann["task_instance"]["images_path"])
+        qwen2_vl_image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
         for i in range(img_num):
             rmv_txt = "{image#%d}"% (i+1)
             rmv_tbl = "{table#%d}"% (i+1)
-            context = context.replace(rmv_txt, "")
-            context = context.replace(rmv_tbl, "")
+            context = context.replace(rmv_txt, qwen2_vl_image_placeholder)
+            context = context.replace(rmv_tbl, qwen2_vl_image_placeholder)

         task_instruction_id = ann["task_instruction_id"]
         context_str = task_instructions[task_instruction_id] + "\n" + context
         prompt = MileBenchDataset._transform_string(context_str)

-        conversation = [
-            {
-                "role": "user",
-                "content": [{"type": "text", "text": prompt}],
-            },
-        ]
-        for i in range(img_num):
-            conversation[0]["content"].append({"type": "image"})
-
         images = []
         for p in ann["task_instance"]["images_path"]:
             img_path = os.path.join(self.image_dir, p)
@@ -95,7 +130,7 @@ def __getitem__(self, idx):
             images.append(image_tensor)

         return {
-            "conversation": conversation,
+            "prompt": prompt,
             "images": images,
             "gt_answer": ann["response"],
             "choice_list": ann["task_instance"].get("choice_list", None),

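Because __getitem__ now returns a raw prompt string instead of a chat conversation, image positions must be encoded inline so the VLM pipeline can align each image with its place in the text. A minimal sketch of the substitution the diff performs, with an invented sample context for illustration:

QWEN2_VL_IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"

def inline_image_placeholders(context: str, img_num: int) -> str:
    # Swap MileBench's "{image#N}" / "{table#N}" markers for the
    # placeholder a Qwen2-VL style prompt expects at each image slot.
    for i in range(img_num):
        context = context.replace("{image#%d}" % (i + 1), QWEN2_VL_IMAGE_PLACEHOLDER)
        context = context.replace("{table#%d}" % (i + 1), QWEN2_VL_IMAGE_PLACEHOLDER)
    return context

# Invented example:
sample = "Compare {image#1} with {image#2} and answer the question."
print(inline_image_placeholders(sample, 2))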