--author=Kanzhi Cheng <827023266@qq.com>

QianhuiWu · QianhuiWu · commit c9ffbf19ccb5 · 2025-06-07T22:14:51.000-07:00
support qwen2.5VL
diff --git a/README.md b/README.md
@@ -140,15 +140,15 @@ import torch
 
 from qwen_vl_utils import process_vision_info
 from datasets import load_dataset
-from transformers import Qwen2VLProcessor
+from transformers import AutoProcessor
 from gui_actor.constants import chat_template
 from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
 from gui_actor.inference import inference
 
 
 # load model
 model_name_or_path = "microsoft/GUI-Actor-7B-Qwen2-VL"
-data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
+data_processor = AutoProcessor.from_pretrained(model_name_or_path)
 tokenizer = data_processor.tokenizer
 model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
     model_name_or_path,
diff --git a/eval/screenSpot.py b/eval/screenSpot.py
@@ -5,13 +5,14 @@
 
 from tqdm import tqdm
 from datasets import load_dataset
-from transformers import Qwen2VLProcessor
+from transformers import AutoProcessor
 
 from gui_actor.constants import chat_template
 from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
+from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
 from gui_actor.inference import inference, ForceFollowTokensLogitsProcessor
 from gui_actor.utils import do_boxes_overlap
-from gui_actor.constants import DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN, grounding_system_message
+from gui_actor.constants import DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN
 
 IMAGE_PATCH_SIZE =14
 
@@ -27,19 +28,33 @@ def normalize_bbox(bbox_x1y1x2y2, img_width, img_height):
         y2 = y2 / img_height
         return x1, y1, x2, y2
 
-def evaluate(model_name_or_path, use_placeholder, topk):
+def evaluate(model_name_or_path, model_type, use_placeholder, topk):
     # initialize model
-    data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
+    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
     tokenizer = data_processor.tokenizer
     for k, v in tokenizer.added_tokens_encoder.items():
         print(v, k)
 
-    model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
-        model_name_or_path,
-        torch_dtype=torch.bfloat16,
-        device_map="cuda:0",
-        attn_implementation="flash_attention_2"
-    ).eval()
+    if model_type == "qwen2vl":
+        print(f"Loading model with Qwen2-VL backbone from {model_name_or_path}")
+        model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
+            model_name_or_path,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda:0",
+            attn_implementation="flash_attention_2"
+        ).eval()
+        grounding_system_message = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task."
+    elif model_type == "qwen25vl":
+        print(f"Loading model with Qwen2.5-VL backbone from {model_name_or_path}")
+        model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
+            model_name_or_path,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda:0",
+            attn_implementation="flash_attention_2"
+        ).eval()
+        grounding_system_message = "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>)."
+    else:
+        raise ValueError(f"Invalid model type: {model_type}")
     print(f"Loaded model from {model_name_or_path}")
 
     logits_processor_pointer = ForceFollowTokensLogitsProcessor(
@@ -248,7 +263,8 @@ def format_cell(cell):
 """
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name_or_path", type=str, default="microsoft/GUI-Actor-2B-Qwen2-VL")
+    parser.add_argument("--model_type", type=str, default="qwen25vl", choices=["qwen2vl", "qwen25vl"])
+    parser.add_argument("--model_name_or_path", type=str, default="qianhuiwu/GUI-Actor-3B-Qwen-2.5-VL")
     parser.add_argument("--save_path", type=str, default="./")
     parser.add_argument('--topk', type=int, default=3, help='Topk')
     parser.add_argument('--no-placeholder', dest='use_placeholder', action='store_false', help='Disable the placeholder')
@@ -271,7 +287,7 @@ def format_cell(cell):
             results = json.load(f)
     else:
         print(f"Evaluating {args.model_name_or_path}...")
-        results = evaluate(args.model_name_or_path, args.use_placeholder, args.topk)
+        results = evaluate(args.model_name_or_path, args.model_type, args.use_placeholder, args.topk)
         with open(pred_path, "w") as f:
             json.dump(results, f)
         print(f"Saved {len(results)} predictions to {pred_path}")
diff --git a/eval/screenSpot_pro.py b/eval/screenSpot_pro.py
@@ -5,13 +5,14 @@
 
 from tqdm import tqdm
 from datasets import load_dataset
-from transformers import Qwen2VLProcessor
+from transformers import AutoProcessor
 from PIL import Image
 from gui_actor.constants import chat_template
 from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
+from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
 from gui_actor.inference import inference, ForceFollowTokensLogitsProcessor
 from gui_actor.utils import do_boxes_overlap
-from gui_actor.constants import DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN, grounding_system_message
+from gui_actor.constants import DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN
 
 IMAGE_PATCH_SIZE =14
 
@@ -27,19 +28,33 @@ def normalize_bbox(bbox_x1y1x2y2, img_width, img_height):
         y2 = y2 / img_height
         return x1, y1, x2, y2
 
-def evaluate(model_name_or_path, data_fn, image_dir, use_placeholder, topk, resize_to_pixels=None):
+def evaluate(model_name_or_path, model_type, data_fn, image_dir, use_placeholder, topk, resize_to_pixels=None):
     # initialize model
-    data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
+    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
     tokenizer = data_processor.tokenizer
     for k, v in tokenizer.added_tokens_encoder.items():
         print(v, k)
 
-    model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
-        model_name_or_path,
-        torch_dtype=torch.bfloat16,
-        device_map="cuda:0",
-        attn_implementation="flash_attention_2"
-    ).eval()
+    if model_type == "qwen2vl":
+        print(f"Loading model with Qwen2-VL backbone from {model_name_or_path}")
+        model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
+            model_name_or_path,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda:0",
+            attn_implementation="flash_attention_2"
+        ).eval()
+        grounding_system_message = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task."
+    elif model_type == "qwen25vl":
+        print(f"Loading model with Qwen2.5-VL backbone from {model_name_or_path}")
+        model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
+            model_name_or_path,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda:0",
+            attn_implementation="flash_attention_2"
+        ).eval()
+        grounding_system_message = "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>)."
+    else:
+        raise ValueError(f"Invalid model type: {model_type}")
     print(f"Loaded model from {model_name_or_path}")
 
     logits_processor_pointer = ForceFollowTokensLogitsProcessor(
@@ -137,6 +152,8 @@ def evaluate(model_name_or_path, data_fn, image_dir, use_placeholder, topk, resi
         results.append(ele)
     
     return results
+
+
 def get_metric(list_of_examples, 
                groups=["Dev", "Creative", "CAD", "Scientific", "Office", "OS"],
                ui_types=["text", "icon"]):
@@ -247,13 +264,15 @@ def format_cell(cell):
     print(metric_info)
     return metric_info
 
+
 """
 # cd to project root directory
 python eval/screenSpot_pro.py --save_path <path_to_save_results> --data_path <path_to_data>
 """
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name_or_path", type=str, default="microsoft/GUI-Actor-2B-Qwen2-VL")
+    parser.add_argument("--model_type", type=str, default="qwen25vl", choices=["qwen2vl", "qwen25vl"])
+    parser.add_argument("--model_name_or_path", type=str, default="microsoft/GUI-Actor-7B-Qwen2.5-VL")
     parser.add_argument("--save_path", type=str, default="./")
     parser.add_argument("--data_path", type=str, default="/mnt/data/ScreenSpot-Pro")
     parser.add_argument("--resize_to_pixels", type=int, default=3200*1800, help="If set to <0, will not resize the image.")
@@ -281,7 +300,7 @@ def format_cell(cell):
             results = json.load(f)
     else:
         print(f"Evaluating {args.model_name_or_path}...")
-        results = evaluate(args.model_name_or_path, data_fn, image_dir, args.use_placeholder, args.topk, resize_to_pixels)
+        results = evaluate(args.model_name_or_path, args.model_type, data_fn, image_dir, args.use_placeholder, args.topk, resize_to_pixels)
         with open(pred_path, "w") as f:
             json.dump(results, f)
         print(f"Saved {len(results)} predictions to {pred_path}")
diff --git a/eval/screenSpot_v2.py b/eval/screenSpot_v2.py
@@ -5,13 +5,14 @@
 
 from tqdm import tqdm
 from datasets import load_dataset
-from transformers import Qwen2VLProcessor
+from transformers import AutoProcessor
 
 from gui_actor.constants import chat_template
 from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
+from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
 from gui_actor.inference import inference, ForceFollowTokensLogitsProcessor
 from gui_actor.utils import do_boxes_overlap
-from gui_actor.constants import DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN, grounding_system_message
+from gui_actor.constants import DEFAULT_POINTER_PAD_TOKEN, DEFAULT_POINTER_END_TOKEN
 
 IMAGE_PATCH_SIZE =14
 
@@ -27,19 +28,33 @@ def normalize_bbox(bbox_x1y1x2y2, img_width, img_height):
         y2 = y2 / img_height
         return x1, y1, x2, y2
 
-def evaluate(model_name_or_path, use_placeholder, topk):
+def evaluate(model_name_or_path, model_type, use_placeholder, topk):
     # initialize model
-    data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
+    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
     tokenizer = data_processor.tokenizer
     for k, v in tokenizer.added_tokens_encoder.items():
         print(v, k)
 
-    model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
-        model_name_or_path,
-        torch_dtype=torch.bfloat16,
-        device_map="cuda:0",
-        attn_implementation="flash_attention_2"
-    ).eval()
+    if model_type == "qwen2vl":
+        print(f"Loading model with Qwen2-VL backbone from {model_name_or_path}")
+        model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
+            model_name_or_path,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda:0",
+            attn_implementation="flash_attention_2"
+        ).eval()
+        grounding_system_message = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task."
+    elif model_type == "qwen25vl":
+        print(f"Loading model with Qwen2.5-VL backbone from {model_name_or_path}")
+        model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
+            model_name_or_path,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda:0",
+            attn_implementation="flash_attention_2"
+        ).eval()
+        grounding_system_message = "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>)."
+    else:
+        raise ValueError(f"Invalid model type: {model_type}")
     print(f"Loaded model from {model_name_or_path}")
 
     logits_processor_pointer = ForceFollowTokensLogitsProcessor(
@@ -248,6 +263,7 @@ def format_cell(cell):
 """
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--model_type", type=str, default="qwen2vl", choices=["qwen2vl", "qwen25vl"])
     parser.add_argument("--model_name_or_path", type=str, default="microsoft/GUI-Actor-2B-Qwen2-VL")
     parser.add_argument("--save_path", type=str, default="./")
     parser.add_argument('--topk', type=int, default=3, help='Topk')
@@ -271,7 +287,7 @@ def format_cell(cell):
             results = json.load(f)
     else:
         print(f"Evaluating {args.model_name_or_path}...")
-        results = evaluate(args.model_name_or_path, args.use_placeholder, args.topk)
+        results = evaluate(args.model_name_or_path, args.model_type, args.use_placeholder, args.topk)
         with open(pred_path, "w") as f:
             json.dump(results, f)
         print(f"Saved {len(results)} predictions to {pred_path}")
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,9 +14,10 @@ dependencies = [
     "accelerate==1.1.1",
     "qwen-vl-utils==0.0.8",
     "deepspeed==0.16.0",
-    "transformers==4.50.0",
+    "transformers==4.51.3",
     "flash-attn",
-    "wandb==0.18.3"
+    "wandb==0.18.3",
+    "datasets>=2.18.0"
 ]
 requires-python = ">=3.10,<3.13"
 readme = "README.md"
diff --git a/scripts/train.sh b/scripts/train.sh
@@ -1,12 +1,15 @@
 #!/bin/bash
-llm_model="./checkpoints/qwen2vl_warmup"
-output_dir="./checkpoints/qwen2vl_sft"
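+# model_type: qwen2vl or qwen25vl. Passed to train.py via --model_type and also selects the ./checkpoints/${model_type}_warmup checkpoint loaded below, so use the same value as in scripts/warmup.sh.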
+model_type="qwen2vl"
+llm_model="./checkpoints/${model_type}_warmup"
+output_dir="./checkpoints/${model_type}_sft"
 
 # === Training Command ===
 torchrun --nproc_per_node=4 train.py \
   --deepspeed ./scripts/zero3.json \
   --data_path data/data_config.yaml \
   --image_folder "" \
+  --model_type ${model_type} \
   --model_name_or_path ${llm_model} \
   --group_by_modality_length True \
   --bf16 True \
diff --git a/scripts/warmup.sh b/scripts/warmup.sh
@@ -1,12 +1,15 @@
 #!/bin/bash
-llm_model="Qwen/Qwen2-VL-2B-Instruct"
-output_dir="./checkpoints/qwen2vl_warmup"
+# model_type: qwen2vl or qwen25vl. Passed to train.py via --model_type and used to name output_dir below.
+model_type="qwen25vl"
+llm_model="Qwen/Qwen2.5-VL-3B-Instruct"
+output_dir="./checkpoints/${model_type}_warmup"
 
 # === Training Command ===
 torchrun --nproc_per_node=4 train.py \
   --deepspeed ./scripts/zero3.json \
   --data_path data/data_config.yaml \
   --image_folder "" \
+  --model_type ${model_type} \
   --model_name_or_path ${llm_model} \
   --group_by_modality_length True \
   --bf16 True \
diff --git a/src/gui_actor/constants.py b/src/gui_actor/constants.py
@@ -15,7 +15,7 @@
 # UNMASK_TOKEN_IDS = [198, 151644, 151645]
 
 # System Message
-grounding_system_message = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task."
+grounding_system_message = "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>)."
 
 # Chat Template
 chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
diff --git a/src/gui_actor/modeling.py b/src/gui_actor/modeling.py
@@ -166,7 +166,7 @@ def forward(self,
         if inputs_embeds is None:
             inputs_embeds = self.model.embed_tokens(input_ids) # shape: (batch_size, seq_len, d_model)
             if pixel_values is not None:
-                pixel_values = pixel_values.type(self.visual.get_dtype())
+                pixel_values = pixel_values.type(self.visual.dtype)
                 image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                 n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                 n_image_features = image_embeds.shape[0]
@@ -184,7 +184,7 @@ def forward(self,
                 inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
 
             if pixel_values_videos is not None:
-                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
+                pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
                 video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                 n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                 n_video_features = video_embeds.shape[0]
diff --git a/src/gui_actor/modeling_qwen25vl.py b/src/gui_actor/modeling_qwen25vl.py
diff --git a/train.py b/train.py