From 3eaaa57ca7ec8424dab4ba30c1378ed043d80dd5 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 26 Sep 2024 13:57:30 +0800
Subject: [PATCH 01/33] add VLM examples

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Llava/README.md   |   79 +
 .../quantization/auto_round/Llava/main.py     |  385 ++++
 .../Llava/mm_evaluation/__init__.py           |    1 +
 .../auto_round/Llava/mm_evaluation/textvqa.py |  201 ++
 .../auto_round/Llava/run_autoround.sh         |   20 +
 .../auto_round/Phi-3-vision/README.md         |  115 ++
 .../auto_round/Phi-3-vision/eval/__init__.py  |    0
 .../Phi-3-vision/eval/evaluation.py           |  280 +++
 .../auto_round/Phi-3-vision/main.py           |  430 +++++
 .../auto_round/Phi-3-vision/model/__init__.py |    4 +
 .../model/configuration_phi3_v.py             |  217 +++
 .../model/image_embedding_phi3_v.py           |  313 ++++
 .../model/image_processing_phi3_v.py          |  274 +++
 .../Phi-3-vision/model/modeling_phi3_v.py     | 1634 +++++++++++++++++
 .../Phi-3-vision/model/processing_phi3_v.py   |  296 +++
 .../auto_round/Phi-3-vision/requirements.txt  |   18 +
 .../auto_round/Phi-3-vision/run_autoround.sh  |   14 +
 .../quantization/auto_round/Qwen-VL/README.md |  178 ++
 .../quantization/auto_round/Qwen-VL/main.py   |  526 ++++++
 .../Qwen-VL/mm_evaluation/__init__.py         |    4 +
 .../mm_evaluation/evaluate_multiple_choice.py |  216 +++
 .../Qwen-VL/mm_evaluation/evaluate_vqa.py     |  464 +++++
 .../auto_round/Qwen-VL/mm_evaluation/main.py  |  101 +
 .../auto_round/Qwen-VL/mm_evaluation/vqa.py   |  206 +++
 .../Qwen-VL/mm_evaluation/vqa_eval.py         |  330 ++++
 .../auto_round/Qwen-VL/run_autoround.sh       |   21 +
 .../auto_round/Qwen-VL/run_eval.sh            |   15 +
 .../quantization/auto_round/__init__.py       |    0
 .../quantization/auto_round/requirements.txt  |   14 +
 29 files changed, 6356 insertions(+)
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/__init__.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/textvqa.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/__init__.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/evaluation.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/__init__.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/configuration_phi3_v.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_embedding_phi3_v.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_processing_phi3_v.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/modeling_phi3_v.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/processing_phi3_v.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/__init__.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_vqa.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/main.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa_eval.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_eval.sh
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/__init__.py
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
new file mode 100644
index 00000000000..367c3b96700
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
@@ -0,0 +1,79 @@
+
+Step-by-Step
+============
+This document describes the step-by-step instructions to run [VLM quantization for Llava](https://huggingface.co/liuhaotian/llava-v1.5-7b) using AutoRound Quantization.
+
+# Run Quantization on Multimodal Models
+
+In this example, we introduce a straightforward way to execute quantization on some popular multimodal models such as LLaVA.
+
+Please note that the quantized LLaVA model currently supports inference only with the **auto_round** format.
+
+## Install
+If you are not using Linux, do NOT proceed, see instructions for [macOS](https://github.com/haotian-liu/LLaVA/blob/main/docs/macOS.md) and [Windows](https://github.com/haotian-liu/LLaVA/blob/main/docs/Windows.md).
+
+1. Clone this repository and navigate to LLaVA folder
+```shell
+git clone https://github.com/haotian-liu/LLaVA.git
+cd LLaVA
+```
+
+2. Install Package
+```
+pip install --upgrade pip  # enable PEP 660 support
+pip install -e .
+```
+
+## Download the calibration data
+
+Our calibration process resembles the official visual instruction tuning process, in order to align with the official implementation of [LLaVA](https://github.com/haotian-liu/LLaVA/tree/main?tab=readme-ov-file#visual-instruction-tuning).
+
+Please download the annotation of the final mixture of the instruction tuning data, [llava_v1_5_mix665k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json), and download the images from the constituent datasets:
+
+COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip the image folder to any directory you desire.
+
+<br />
+
+## 2. Run Examples
+Enter into the examples folder and install requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+- **Default Settings:**
+```bash
+CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name liuhaotian/llava-v1.5-7b  --bits 4 --group_size 128
+```
+
+## 3. Results
+We use the [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration and the TextVQA dataset for evaluation. When the vision components are not quantized, the accuracy loss stays within 1%. The results for the fake-quantized LLaVA-7b are as follows:
+| Model | Config | Precision | Hyperparameter | Accuracy% | Relative drop |
+|  :----: | :----: | :----: | :----: | :----: | :----: |
+| liuhaotian/llava-v1.5-7b | - | FP16 | - | 58.21 | - |
+| liuhaotian/llava-v1.5-7b | W4G128 | FP16 | with vision | 56.39 | -3.13% |
+| liuhaotian/llava-v1.5-7b | W4G128 | FP16 | w/o vision | 58.08 | -0.22% |
+
+
+## 4. Known Issues
+* Hugging Face format models are not supported yet, e.g. llava-1.5-7b-hf
+* Setting seqlen to 2048 is not working yet.
+
+
+## 5. Environment
+
+PyTorch 1.8 or higher is required.
+
+
+## Reference
+If you find SignRound useful for your research, please cite our paper:
+```bibtex
+@article{cheng2023optimize,
+  title={Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs},
+  author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao},
+  journal={arXiv preprint arXiv:2309.05516},
+  year={2023}
+}
+```
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
new file mode 100644
index 00000000000..07417f013d9
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
@@ -0,0 +1,385 @@
+import argparse
+# import sys
+parser = argparse.ArgumentParser()
+import torch
+import os
+import transformers
+
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+torch.use_deterministic_algorithms(True, warn_only=True)
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
+from transformers import set_seed
+
+import re
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+import copy
+from PIL import Image
+import json
+from torch.utils.data import Dataset, DataLoader
+from llava.mm_utils import get_model_name_from_path
+from llava.train.train import preprocess, preprocess_multimodal, DataCollatorForSupervisedDataset
+from llava.model.builder import load_pretrained_model
+from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
+                                                    to_device,
+                                                    to_dtype,
+                                                    get_layer_names_in_block,
+                                                    detect_device,
+                                                    run_fn_for_vlm_autoround
+                                                    )
+from neural_compressor.torch.quantization import (AutoRoundConfig,
+                                                    prepare,
+                                                    convert,
+                                                    load)
+
+
+def save_tower(model, save_path, quant_vision: bool = False, max_shard_size: str = "5GB", safe_serialization: bool = True):
+    """Save the quantized vision tower separately and point the model config at the saved copy.
+
+    Nothing is written unless ``quant_vision`` is true. The tower goes to ``<save_path>-vision_tower`` and
+    ``quantization_config['quant_block_list']`` is split between the tower config and the model config.
+    """
+    if not quant_vision:
+        print("Won't save vision_tower since this part was not quantized.")
+        return
+    tower_name = model.get_vision_tower().vision_tower_name
+    tower = model.get_vision_tower().vision_tower
+    tower_dir = f'{save_path}-vision_tower'
+    os.makedirs(tower_dir, exist_ok=True)
+    quant_cfg = model.config.quantization_config
+    tower_prefix = "model.vision_tower.vision_tower."
+    # Flatten a private copy of the block groups once; both filtered views below are derived from it.
+    all_blocks = [name for group in copy.deepcopy(quant_cfg['quant_block_list']) for name in group]
+    # Blocks inside the vision tower, with their names made relative to the tower itself.
+    tower_blocks = [name.split(tower_prefix)[1] for name in all_blocks if tower_prefix in name]
+    quant_cfg['quant_block_list'] = [[name for name in tower_blocks if name != ""]]
+    if hasattr(tower, "config"):
+        from transformers import AutoProcessor
+        AutoProcessor.from_pretrained(tower_name).save_pretrained(tower_dir)
+        tower.config.quantization_config = quant_cfg
+        tower.config.save_pretrained(tower_dir)
+    tower.save_pretrained(tower_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
+    # Everything outside the vision tower stays in the LLaVA model's own quantization config.
+    quant_cfg['quant_block_list'] = [[name for name in all_blocks if tower_prefix not in name and name != ""]]
+    # Re-point the LLaVA config at the exported tower directory before writing it to save_path.
+    model.config.mm_vision_tower = tower_dir
+    model.config.save_pretrained(save_path)
+    
+
+class CustomDataset(Dataset): # for llava tuning
+    # much refer to https://github.com/haotian-liu/LLaVA/blob/main/llava/train/train.py
+    """Calibration dataset that turns instruction records into tokenized samples plus an image tensor."""
+    def __init__(self, list_data_dict, image_folder, tokenizer, image_processor, args):
+        self.list_data_dict = list_data_dict
+        self.image_folder = image_folder
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        self.args = args  # forwarded unchanged to preprocess_multimodal
+
+    def __getitem__(self, index):
+        """Return input_ids/labels for one record plus its preprocessed image (None when absent or unreadable)."""
+        record = self.list_data_dict[index]
+        has_image = 'image' in record
+        image = None
+        if has_image:  # text-only records carry no "image" key and must not raise KeyError
+            image_file = os.path.basename(record["image"])
+            try:
+                image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
+                image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+            except Exception as error:  # deliberate best effort: keep calibrating without this image
+                print(f"{error}, skipped by set image to None")
+                image = None
+        sources = preprocess_multimodal(copy.deepcopy([record["conversations"]]), self.args)
+        data_dict = preprocess(
+            sources,
+            self.tokenizer,
+            has_image=has_image,
+        )
+        if isinstance(index, int):
+            data_dict = dict(input_ids=data_dict["input_ids"][0],
+                            labels=data_dict["labels"][0])
+        # always expose the key; the value is None when the record has no usable image
+        data_dict['image'] = image
+        return data_dict
+
+    def __len__(self):
+        return len(self.list_data_dict)
+
+
+def create_data_loader(dataset, batch_size=1, data_collator=None):
+    """Wrap ``dataset`` in an unshuffled DataLoader; only ``batch_size == 1`` is accepted."""
+    assert batch_size == 1, "batch_size must be 1"
+    return DataLoader(dataset, collate_fn=data_collator, shuffle=False, batch_size=batch_size)
+
+if __name__ == '__main__':
+    def str2bool(text):
+        """Parse a command-line boolean; argparse's ``type=bool`` maps every non-empty string (even "False") to True."""
+        return str(text).strip().lower() in ("1", "true", "t", "yes", "y", "on")
+
+    parser.add_argument("--model_name", default="liuhaotian/llava-v1.5-7b")
+    parser.add_argument("--quantize", action="store_true")
+    parser.add_argument("--accuracy", action="store_true")
+    parser.add_argument("--bits", default=4, type=int,
+                        help="number of  bits")
+
+    parser.add_argument("--group_size", default=128, type=int,
+                        help="group size")
+
+    parser.add_argument("--train_bs", default=1, type=int,
+                        help="train batch size")
+
+    parser.add_argument("--eval_bs", default=4, type=int,
+                        help="eval batch size")
+
+    parser.add_argument("--device", default="auto", type=str,
+                        help="The device to be used for tuning. The default is set to auto/None,"
+                             "allowing for automatic detection. Currently, device settings support CPU, GPU, and HPU.")
+
+    parser.add_argument("--sym", action='store_true',
+                        help=" sym quantization")
+
+    parser.add_argument("--iters", default=200, type=int,
+                        help=" iters")
+
+    parser.add_argument("--lr", default=None, type=float,
+                        help="learning rate, if None, it will be set to 1.0/iters automatically")
+
+    parser.add_argument("--minmax_lr", default=None, type=float,
+                        help="minmax learning rate, if None,it will beset to be the same with lr")
+
+    parser.add_argument("--seed", default=42, type=int,
+                        help="seed")
+
+    parser.add_argument("--eval_fp16_baseline", action='store_true',
+                        help="whether to eval FP16 baseline")
+
+    parser.add_argument("--adam", action='store_true',
+                        help="adam")
+
+    parser.add_argument("--seqlen", default=512, type=int,
+                        help="sequence length")
+
+    parser.add_argument("--gradient_accumulate_steps", default=1, type=int, help="gradient accumulate steps")
+
+    parser.add_argument("--nblocks", default=1, type=int, help="num of blocks to tune together")
+
+    parser.add_argument("--nsamples", default=512, type=int,
+                        help="number of samples")
+
+    parser.add_argument("--low_gpu_mem_usage", action='store_true',
+                        help="low_gpu_mem_usage is deprecated")
+
+    parser.add_argument("--export_format", default='auto_round:gptq', type=str,
+                        help="targeted inference acceleration platform,The options are 'fake', 'cpu', 'gpu', 'xpu' and 'auto_round'. "
+                             "default to 'auto_round:gptq'; 'fake' only performs fake quantization and won't be exported to any device.")
+
+    parser.add_argument("--scale_dtype", default='fp16',
+                        help="which scale data type to use for quantization, 'fp16', 'fp32' or 'bf16'.")
+
+    parser.add_argument("--output_dir", default="./tmp_autoround", type=str,
+                        help="Where to store the final model.")
+
+    parser.add_argument("--disable_eval", action='store_true',
+                        help="Whether to disable lmeval evaluation.")
+
+    parser.add_argument("--disable_amp", action='store_true',
+                        help="disable amp")
+
+    parser.add_argument("--disable_minmax_tuning", action='store_true',
+                        help="whether disable  enable weight minmax tuning")
+
+    parser.add_argument("--disable_trust_remote_code", action='store_true',
+                        help="Whether to disable trust_remote_code")
+
+    parser.add_argument("--disable_quanted_input", action='store_true',
+                        help="whether to disuse the output of quantized block to tune the next block")
+
+    parser.add_argument("--quant_lm_head", action='store_true',
+                        help="quant_lm_head")
+
+    parser.add_argument("--model_dtype", default=None, type=str,
+                        help="force to convert the dtype, some backends supports fp16 dtype better")
+
+    parser.add_argument("--act_bits", default=32, type=int,
+                    help="activation bits")
+
+    parser.add_argument("--is_multimodal", type=str2bool, default=True,
+                        help="To determine whether the preprocessing should handle multimodal infomations.")
+
+    parser.add_argument("--quant_vision", action='store_true',
+                        help="To determine whether the quantization should handle vision component.")
+
+    # ========== Calibration Datasets =============
+    parser.add_argument("--mm-use-im-start-end", type=str2bool, default=False)
+
+    parser.add_argument("--image_folder", default="coco", type=str,
+                        help="The dataset for quantization training. It can be a custom one.")
+
+    parser.add_argument("--question_file", default=None, type=str,
+                            help="The dataset for quantization training. It can be a custom one.")
+
+    # ================= Evaluation Related =====================
+    parser.add_argument("--eval-question-file", type=str, default="tables/question.jsonl")
+
+    parser.add_argument("--eval-image-folder", type=str)
+
+    parser.add_argument('--eval-result-file', type=str)
+
+    parser.add_argument('--eval-annotation-file', type=str)
+
+    args = parser.parse_args()
+
+    if args.quantize:
+        set_seed(args.seed)
+
+        if args.act_bits <= 8:
+            print(
+                "Warning, activation quantization is an experiment feature")
+
+        if args.act_bits <= 8 and args.export_format != "fake":
+            raise ValueError("only support fake mode for activation quantization currently")  # not stripped by -O
+
+        if "marlin" in args.export_format and not args.sym:
+            raise ValueError("marlin backend only supports sym quantization, please set --sym")
+
+        model_name = args.model_name
+        if model_name[-1] == "/":
+            model_name = model_name[:-1]
+        print(model_name, flush=True)
+
+        from auto_round.utils import detect_device
+
+        device_str = detect_device(args.device)
+        torch_dtype = "auto"
+        torch_device = torch.device(device_str)
+        model_path = args.model_name
+        model_name = get_model_name_from_path(model_path)
+        tokenizer, model, image_processor, _ = load_pretrained_model(model_path, model_base=None, model_name=model_name,
+                torch_dtype=torch_dtype)
+
+        model = model.eval()
+
+        if args.model_dtype is not None:
+            if args.model_dtype in ("float16", "fp16"):
+                model = model.to(torch.float16)
+            if args.model_dtype in ("bfloat16", "bfp16"):
+                model = model.to(torch.bfloat16)
+
+        seqlen = args.seqlen
+        if hasattr(tokenizer, "model_max_length"):
+            if tokenizer.model_max_length < seqlen:
+                print(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length",
+                    flush=True)
+                seqlen = min(seqlen, tokenizer.model_max_length)
+                args.seqlen = seqlen
+
+        if (hasattr(model, 'config') and (model.dtype is torch.bfloat16 or model.config.torch_dtype is torch.bfloat16)):
+            dtype = 'bfloat16'
+            pt_dtype = torch.bfloat16
+        else:
+            if str(args.device) != "cpu":
+                pt_dtype = torch.float16
+                dtype = 'float16'
+            else:
+                pt_dtype = torch.float32
+                dtype = 'float32'
+
+        if args.question_file is None:  # the default; open(None) would otherwise fail with a cryptic TypeError
+            raise ValueError("--question_file is required for quantization calibration")
+        with open(args.question_file, "r") as question_fp:
+            questions = json.load(question_fp)
+        dataset = CustomDataset(questions, args.image_folder, tokenizer, image_processor, args=args)
+        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
+        dataloader = create_data_loader(dataset, args.train_bs, data_collator)
+
+        quant_block_list = get_multimodal_block_names(model, args.quant_vision)
+
+        quant_config = AutoRoundConfig(bits=args.bits, use_sym=args.sym, batch_size=args.train_bs, group_size=args.group_size,
+                            seqlen=seqlen, nblocks=args.nblocks, iters=args.iters, lr=args.lr,
+                            minmax_lr=args.minmax_lr, enable_quanted_input=not args.disable_quanted_input,
+                            nsamples=args.nsamples, seed=args.seed, gradient_accumulate_steps=args.gradient_accumulate_steps,
+                            scale_dtype=args.scale_dtype, enable_minmax_tuning=not args.disable_minmax_tuning, act_bits=args.act_bits,
+                            quant_block_list=quant_block_list, export_format=args.export_format)
+
+        all_block_list = get_multimodal_block_names(model, quant_vision=True)
+        all_block_set = set(tuple(block) for block in all_block_list)
+        quant_block_set = set(tuple(block) for block in quant_block_list)
+        set_to_full_prec = list(all_block_set - quant_block_set)
+        set_to_full_prec = get_layer_names_in_block(model, quant_block_list=set_to_full_prec)
+        for name in set_to_full_prec:
+            quant_config.set_local(name, AutoRoundConfig(dtype="fp32"))
+
+        # skip special layers
+        quant_config.set_local("model.mm_projector*", AutoRoundConfig(dtype="fp32"))
+
+        for n, m in model.named_modules():
+            if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
+                if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
+                    quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
+                    print(
+                        f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
+
+        lm_head_layer_name = "lm_head"
+        if args.quant_lm_head:
+            from transformers import AutoConfig
+            config = AutoConfig.from_pretrained(model_path, trust_remote_code=not args.disable_trust_remote_code)  # model_name was overwritten above
+            if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"):
+                tied_keys = model._tied_weights_keys
+                for item in tied_keys:
+                    if lm_head_layer_name in item:  ##TODO extend to encoder-decoder layer, seq classification model
+                        args.quant_lm_head = False
+                        print(
+                            f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
+                            f"supported currently")
+                        break
+
+        if not args.quant_lm_head:
+            quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
+        else:  # the version requirement concerns lm-head quantization, so check it only when that is enabled
+            transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
+            if transformers_version[0] == 4 and transformers_version[1] < 38:
+                raise EnvironmentError("Please upgrade transformers>=4.38.0 to support lm-head quantization.")
+
+        run_args = (dataloader, seqlen, args.nsamples)
+        user_model = prepare(model=model, quant_config=quant_config)
+        run_fn_for_vlm_autoround(user_model, *run_args)
+        user_model = convert(user_model)
+
+        from neural_compressor.torch.utils import LoadFormat
+        save_tower(user_model, args.output_dir, quant_vision=args.quant_vision)
+        user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE)
+        if tokenizer is not None:
+            tokenizer.save_pretrained(args.output_dir)
+
+    if args.accuracy:
+        device_str = detect_device(args.device)  # top-level neural_compressor import, unless --quantize re-imported auto_round's
+        torch_device = torch.device(device_str)
+        model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code)
+        tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=False)
+        vision_tower = model.get_vision_tower()
+        if not vision_tower.is_loaded:
+            vision_tower.load_model() # replace vision_tower
+            vision_tower.to(device=model.device, dtype=model.dtype)
+        image_processor = vision_tower.image_processor
+        model = model.to(torch_device)
+        model_path = args.model_name
+        model_name = get_model_name_from_path(model_path)
+        # NOTE: the checkpoint is restored through neural_compressor's load() above rather than LLaVA's
+        # load_pretrained_model(); the image processor therefore comes from the vision tower, which is
+        # loaded on demand when is_loaded is false.
+        from mm_evaluation import TextVQAEvaluator
+        evaluator = TextVQAEvaluator(
+            model,
+            tokenizer,
+            image_processor,
+            args.eval_image_folder,
+            args.eval_question_file,
+            args.eval_annotation_file,
+            model_name = model_name
+        )
+        evaluator.run_evaluate(result_file = args.eval_result_file)  # the same result file is then passed to the scoring call
+        evaluator.calculate_accuracy(result_file = args.eval_result_file)
+
+
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/__init__.py
new file mode 100644
index 00000000000..42c010e5e21
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/__init__.py
@@ -0,0 +1 @@
+from .textvqa import TextVQAEvaluator
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/textvqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/textvqa.py
new file mode 100644
index 00000000000..2dd384603da
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/mm_evaluation/textvqa.py
@@ -0,0 +1,201 @@
+import sys
+import os
+import math
+from tqdm import tqdm
+import shortuuid
+import json
+import re
+
+from PIL import Image
+
+import torch
+from torch.utils.data import Dataset, DataLoader
+from llava.utils import disable_torch_init
+from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator
+from llava.mm_utils import tokenizer_image_token, process_images
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from llava.conversation import conv_templates, SeparatorStyle
+
+def split_list(lst, n):
+    """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # integer division
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+def get_chunk(lst, n, k):
+    chunks = split_list(lst, n)
+    return chunks[k]
+
+def collate_fn(batch):
+    input_ids, image_tensors, image_sizes = zip(*batch)
+    input_ids = torch.stack(input_ids, dim=0)
+    image_tensors = torch.stack(image_tensors, dim=0)
+    return input_ids, image_tensors, image_sizes
+
+class CustomDatasetTextVQA(Dataset):
+    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, conv_mode):
+        self.questions = questions
+        self.image_folder = image_folder
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        self.model_config = model_config
+        self.conv_mode = conv_mode
+
+    def __getitem__(self, index):
+        # import pdb;pdb.set_trace()
+        line = self.questions[index]
+        image_file = line["image"]
+        qs = line["text"]
+        if self.model_config.mm_use_im_start_end:
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+        else:
+            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+
+        conv = conv_templates[self.conv_mode].copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
+        image_tensor = process_images([image], self.image_processor, self.model_config)[0]
+
+        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
+
+        return input_ids, image_tensor, image.size
+
+    def __len__(self):
+        return len(self.questions)
+
+class TextVQAEvaluator(object):
+    def __init__(
+            self, 
+            model, 
+            tokenizer, 
+            image_processor, 
+            image_folder,
+            question_file,
+            annotation_file, 
+            **kwargs
+        ):
+        super(TextVQAEvaluator, self).__init__()
+        self.model = model
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        self.image_folder = image_folder
+        self.question_file = question_file
+        self.annotation_file = annotation_file
+        # follow parameters can be set as default value.
+        self.model_name = kwargs.get("model_name", "llava")
+        self.conv_mode = kwargs.get("conv_mode", "vicuna_v1")
+        self.num_chunks = kwargs.get("num_chunks", 1)
+        self.chunk_idx = kwargs.get("chunk_idx", 0)
+        self.temperature = kwargs.get("temperature", 0)
+        self.top_p = kwargs.get("top_p", None)
+        self.num_beams = kwargs.get("num_beams", 1)
+        self.max_new_tokens = kwargs.get("max_new_tokens", 128)
+
+        if 'plain' in self.model_name and 'finetune' not in self.model_name.lower() and 'mmtag' not in self.conv_mode:
+            self.conv_mode = self.conv_mode + '_mmtag'
+            print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {self.conv_mode}.')
+
+    def create_dataloader(self):
+        questions = [json.loads(q) for q in open(os.path.expanduser(self.question_file), "r")]
+        questions = get_chunk(questions, self.num_chunks, self.chunk_idx)
+        dataset = CustomDatasetTextVQA(questions, self.image_folder, self.tokenizer, self.image_processor, self.model.config, self.conv_mode)
+        data_loader = DataLoader(dataset, batch_size=1, num_workers=4, shuffle=False, collate_fn=collate_fn)
+        return data_loader, questions
+
+    def run_evaluate(self, result_file = None):
+        disable_torch_init()
+        dataloader, questions = self.create_dataloader()
+        result_file = os.path.expanduser(result_file)
+        os.makedirs(os.path.dirname(result_file), exist_ok=True)
+        res_file = open(result_file, "w")
+        for (input_ids, image_tensor, image_sizes), line in tqdm(zip(dataloader, questions), total=len(questions)):
+            idx = line["question_id"]
+            cur_prompt = line["text"]
+
+            input_ids = input_ids.to(device='cuda', non_blocking=True)
+
+            with torch.inference_mode():
+                output_ids = self.model.generate(
+                    input_ids,
+                    images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
+                    image_sizes=image_sizes,
+                    do_sample=True if self.temperature > 0 else False,
+                    temperature=self.temperature,
+                    top_p=self.top_p,
+                    num_beams=self.num_beams,
+                    max_new_tokens=self.max_new_tokens,
+                    use_cache=True)
+
+            outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
+            ans_id = shortuuid.uuid()
+            res_file.write(json.dumps({"question_id": idx,
+                                    "prompt": cur_prompt,
+                                    "text": outputs,
+                                    "answer_id": ans_id,
+                                    "model_id": self.model_name,
+                                    "metadata": {}}) + "\n")
+        res_file.close()
+
+    def prompt_processor(self, prompt):
+        if prompt.startswith('OCR tokens: '):
+            pattern = r"Question: (.*?) Short answer:"
+            match = re.search(pattern, prompt, re.DOTALL)
+            question = match.group(1)
+        elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3:
+            if prompt.startswith('Reference OCR token:'):
+                question = prompt.split('\n')[1]
+            else:
+                question = prompt.split('\n')[0]
+        elif len(prompt.split('\n')) == 2:
+            question = prompt.split('\n')[0]
+        else:
+            assert False
+
+        return question.lower()
+    
+    def calculate_accuracy(self, result_file = None):
+        experiment_name = os.path.splitext(os.path.basename(result_file))[0]
+        print(experiment_name)
+        annotations = json.load(open(self.annotation_file))['data']
+        annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
+        results = [json.loads(line) for line in open(result_file)]
+
+        pred_list = []
+        for result in results:
+            annotation = annotations[(result['question_id'], self.prompt_processor(result['prompt']))]
+            pred_list.append({
+                "pred_answer": result['text'],
+                "gt_answers": annotation['answers'],
+            })
+
+        evaluator = TextVQAAccuracyEvaluator()
+        print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
+
+
+
+# results
+
+
+
+# def eval_single(annotation_file, result_file):
+#     experiment_name = os.path.splitext(os.path.basename(result_file))[0]
+#     print(experiment_name)
+#     annotations = json.load(open(annotation_file))['data']
+#     annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
+#     results = [json.loads(line) for line in open(result_file)]
+
+#     pred_list = []
+#     for result in results:
+#         annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
+#         pred_list.append({
+#             "pred_answer": result['text'],
+#             "gt_answers": annotation['answers'],
+#         })
+
+#     evaluator = TextVQAAccuracyEvaluator()
+#     print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
new file mode 100644
index 00000000000..44750141fb4
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -x
+device=0  # index of the GPU made visible to the run
+# --eval-result-file is opened for writing as a file, so it must not be the --output_dir directory itself.
+CUDA_VISIBLE_DEVICES=$device \
+python3 main.py \
+--model_name=liuhaotian/llava-v1.5-7b \
+--bits 4 \
+--group_size 128 \
+--iters 200 \
+--seqlen 512 \
+--image_folder /path/to/coco/images/train2017/ \
+--question_file /path/to/LLaVA-Instruct-150K/llava_v1_5_mix665k.json \
+--eval-question-file /path/to/textvqa/llava_textvqa_val_v051_ocr.jsonl \
+--eval-image-folder /path/to/textvqa/train_images \
+--eval-annotation-file /path/to/textvqa/TextVQA_0.5.1_val.json \
+--eval-result-file "./tmp_autoround/textvqa_result.jsonl" \
+--output_dir "./tmp_autoround"
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
new file mode 100644
index 00000000000..6ca61c1ce01
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
@@ -0,0 +1,115 @@
+Step-by-Step
+============
+This document describes the step-by-step instructions to run [VLM quantization for Phi3-Vision](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) using AutoRound Quantization.
+
+# Run Quantization on Phi-3-vision Models
+
+In this example, we introduce a straightforward way to quantize popular multimodal models such as Phi-3-vision.
+
+## 1. Download the calibration data
+
+Our calibration process resembles the official visual instruction tuning process.
+
+Please download the annotation file of the final instruction-tuning data mixture, [llava_v1_5_mix665k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json), and download the images from the constituent datasets:
+
+COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip the image folder to any directory you desire.
+
+
+## 2. Run Examples
+PyTorch 1.8 or higher is required.
+
+Enter the example folder and install the requirements, which include lm-eval for running the evaluation:
+```bash
+pip install -r requirements.txt
+```
+
+- **Default Settings:**
+```bash
+CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name microsoft/Phi-3-vision-128k-instruct  --bits 4 --group_size 128
+```
+
+
+## 3. Run Inference
+
+```python
+from PIL import Image
+import requests
+import io
+from transformers import AutoModelForCausalLM
+from transformers import AutoProcessor
+from auto_round.auto_quantizer import AutoHfQuantizer
+quantized_model_path = "./tmp_autoround"
+model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, torch_dtype="auto", _attn_implementation='flash_attention_2') # use _attn_implementation='eager' to disable flash attention
+
+processor = AutoProcessor.from_pretrained(quantized_model_path, trust_remote_code=True)
+
+messages = [ \
+    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"}, \
+    {"role": "assistant", "content": "The chart displays the percentage of respondents who agree with various statements about their preparedness for meetings. It shows five categories: 'Having clear and pre-defined goals for meetings', 'Knowing where to find the information I need for a meeting', 'Understanding my exact role and responsibilities when I'm invited', 'Having tools to manage admin tasks like note-taking or summarization', and 'Having more focus time to sufficiently prepare for meetings'. Each category has an associated bar indicating the level of agreement, measured on a scale from 0% to 100%."}, \
+    {"role": "user", "content": "Provide insightful questions to spark discussion."}]
+
+url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png" 
+# image = Image.open(requests.get(url, stream=True).raw)
+image = Image.open(io.BytesIO(requests.get(url, stream=True).content))
+
+prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0")
+
+generation_args = {
+    "max_new_tokens": 50,
+    "temperature": 0.0,
+    "do_sample": False,
+}
+
+generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args) 
+
+# remove input tokens 
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 
+
+print(response)
+# 1. How does the level of agreement on each statement reflect the overall preparedness of respondents for meetings?
+# 2. What are the most and least agreed-upon statements, and why might that be the case?
+# 3.
+```
+<!-- 
+
+## 4. Results
+Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and lm_eval dataset for evaluation. please follow the [recipe](./run_autoround.sh) and [evaluate script](./run_eval.sh). The results for Phi-3-vision-128k-instruct are as follows:
+| Metric         | bf16   | INT4   |
+|----------------|--------|--------|
+| avg            | 0.6014 | 0.5940 |
+| mmlu           | 0.6369 | 0.6310 |
+| lambada_openai | 0.6487 | 0.6406 |
+| hellaswag      | 0.5585 | 0.5483 |
+| winogrande     | 0.7395 | 0.7451 |
+| piqa           | 0.7954 | 0.7889 |
+| truthfulqa_mc1 | 0.3084 | 0.2987 |
+| openbookqa     | 0.3580 | 0.3600 |
+| boolq          | 0.8532 | 0.8557 |
+| arc_easy       | 0.8371 | 0.8346 |
+| arc_challenge  | 0.5572 | 0.5469 |
+| cmmlu          | 0.4074 | 0.3950 |
+| ceval          | 0.4027 | 0.4012 |
+| gsm8k          | 0.7157 | 0.6755 | -->
+
+
+
+## Reference
+If you find SignRound useful for your research, please cite our paper:
+```bibtex
+@article{cheng2023optimize,
+  title={Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs},
+  author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao},
+  journal={arXiv preprint arXiv:2309.05516},
+  year={2023}
+}
+```
+
+
+
+
+
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/evaluation.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/evaluation.py
new file mode 100644
index 00000000000..3d542aa594e
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/eval/evaluation.py
@@ -0,0 +1,280 @@
+import itertools
+import logging
+import random
+import time
+from collections import defaultdict
+from typing import TYPE_CHECKING, List, Optional, Union
+
+from packaging.version import Version
+import pkg_resources
+LM_EVAL_VERSION = Version(pkg_resources.get_distribution('lm_eval').version)
+
+import numpy as np
+import torch
+
+import lm_eval.api.metrics
+import lm_eval.api.registry
+import lm_eval.models
+from lm_eval.evaluator import evaluate
+from lm_eval.caching.cache import delete_cache
+from lm_eval.evaluator_utils import run_task_tests
+if LM_EVAL_VERSION == Version('0.4.2'):
+    from lm_eval.logging_utils import add_env_info, get_git_commit_hash
+else:
+    from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
+from lm_eval.tasks import TaskManager, get_task_dict
+from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
+
+if TYPE_CHECKING:
+    from lm_eval.api.model import LM
+    from lm_eval.tasks import Task
+
+
+@positional_deprecated
+def simple_evaluate(
+        model,
+        model_args: Optional[Union[str, dict]] = None,
+        tasks: Optional[List[Union[str, dict, object]]] = None,
+        num_fewshot: Optional[int] = None,
+        batch_size: Optional[int] = None,
+        max_batch_size: Optional[int] = None,
+        device: Optional[str] = None,
+        use_cache: Optional[str] = None,
+        cache_requests: bool = False,
+        rewrite_requests_cache: bool = False,
+        delete_requests_cache: bool = False,
+        limit: Optional[Union[int, float]] = None,
+        bootstrap_iters: int = 100000,
+        check_integrity: bool = False,
+        write_out: bool = False,
+        log_samples: bool = True,
+        gen_kwargs: Optional[str] = None,
+        task_manager: Optional[TaskManager] = None,
+        verbosity: str = "INFO",
+        predict_only: bool = False,
+        random_seed: int = 0,
+        numpy_random_seed: int = 1234,
+        torch_random_seed: int = 1234,
+        user_model = None, ##user model does not support tensor parallelism
+):
+    """Instantiate and evaluate a model on a list of tasks.
+
+    :param model: Union[str, LM]
+        Name of model or LM object, see lm_eval.models.get_model
+    :param model_args: Optional[str, dict]
+        String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object.
+        Ignored if `model` argument is a LM object.
+    :param tasks: list[Union[str, dict, Task]]
+        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME 
+        if defined and type(task).__name__ otherwise.
+    :param num_fewshot: int
+        Number of examples in few-shot context
+    :param batch_size: int or str, optional
+        Batch size for model
+    :param max_batch_size: int, optional
+        Maximal batch size to try with automatic batch size detection
+    :param device: str, optional
+        PyTorch device (e.g. "cpu" or "cuda:0") for running models
+    :param use_cache: str, optional
+        A path to a sqlite db file for caching model responses. `None` if not caching.
+    :param cache_requests: bool, optional
+        Speed up evaluation by caching the building of dataset requests. `None` if not caching.
+    :param rewrite_requests_cache: bool, optional
+        Rewrites all of the request cache if set to `True`. `None` if not desired.
+    :param delete_requests_cache: bool, optional
+        Deletes all of the request cache if set to `True`. `None` if not desired.
+    :param limit: int or float, optional
+        Limit the number of examples per task (only use this for testing), If <1, 
+        limit is a percentage of the total number of examples.
+    :param bootstrap_iters:
+        Number of iterations for bootstrap statistics
+    :param check_integrity: bool
+        Whether to run the relevant part of the test suite for the tasks
+    :param write_out: bool
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
+    :param gen_kwargs: str
+        String arguments for model generation
+        Ignored for all tasks with loglikelihood output_type
+    :param predict_only: bool
+        If true only model outputs will be generated and returned. Metrics will not be evaluated
+    :param random_seed: int
+        Random seed for python's random module. If set to None, the seed will not be set.
+    :param numpy_random_seed: int
+        Random seed for numpy. If set to None, the seed will not be set.
+    :param torch_random_seed: int
+        Random seed for torch. If set to None, the seed will not be set.
+
+    :return
+        Dictionary of results
+    """
+    from auto_round.auto_quantizer import AutoHfQuantizer
+    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+    start_date = time.time()
+
+    if delete_requests_cache:
+        eval_logger.info("Deleting requests cache...")
+        delete_cache()
+
+    seed_message = []
+    if random_seed is not None:
+        # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
+        seed_message.append(f"Setting random seed to {random_seed}")
+        random.seed(random_seed)
+
+    if numpy_random_seed is not None:
+        seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
+        np.random.seed(numpy_random_seed)
+
+    if torch_random_seed is not None:
+        seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
+        torch.manual_seed(torch_random_seed)
+
+    if seed_message:
+        eval_logger.info(" | ".join(seed_message))
+
+    if tasks is None:
+        tasks = []
+    if len(tasks) == 0:
+        raise ValueError(
+            "No tasks specified, or no tasks found. Please verify the task names."
+        )
+
+    if gen_kwargs is not None:
+        gen_kwargs = simple_parse_args_string(gen_kwargs)
+        eval_logger.warning(
+            "generation_kwargs specified through cli, these settings will update set parameters in yaml tasks. "
+            "Ensure 'do_sample=True' for non-greedy decoding!"
+        )
+        if gen_kwargs == "":
+            gen_kwargs = None
+
+    if isinstance(model, str):
+        if model_args is None:
+            model_args = ""
+
+        if isinstance(model_args, dict):
+            lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
+                model_args,
+                {
+                    "batch_size": batch_size,
+                    "max_batch_size": max_batch_size,
+                    "device": device,
+                },
+            )
+
+        else:
+            lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
+                model_args,
+                {
+                    "batch_size": batch_size,
+                    "max_batch_size": max_batch_size,
+                    "device": device,
+                },
+            )
+    else:
+        if not isinstance(model, lm_eval.api.model.LM):
+            raise TypeError
+        lm = model
+
+    if use_cache is not None:
+        eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
+        lm = lm_eval.api.model.CachingLM(
+            lm,
+            use_cache
+            # each rank receives a different cache db.
+            # necessary to avoid multiple writes to cache at once
+            + "_rank"
+            + str(lm.rank)
+            + ".db",
+        )
+    if user_model is not None:
+        lm._model = user_model
+
+    if task_manager is None:
+        task_manager = TaskManager(verbosity)
+
+    task_dict = get_task_dict(tasks, task_manager)
+    for task_name in task_dict.keys():
+        task_obj = task_dict[task_name]
+        if isinstance(task_obj, tuple):
+            _, task_obj = task_obj
+            if task_obj is None:
+                continue
+
+        if task_obj.get_config("output_type") == "generate_until":
+            if gen_kwargs is not None:
+                task_obj.set_config(
+                    key="generation_kwargs", value=gen_kwargs, update=True
+                )
+
+        if predict_only:
+            log_samples = True
+            eval_logger.info(
+                f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
+            )
+            # we have to change the class properties post-hoc. This is pretty hacky.
+            task_obj.override_metric(metric_name="bypass")
+
+        # override tasks' fewshot values to the provided num_fewshot arg value
+        # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
+        if num_fewshot is not None:
+            if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
+                eval_logger.info(
+                    f"num_fewshot has been set to 0 for {task_name} in its config."
+                    " Manual configuration will be ignored."
+                )
+            else:
+                eval_logger.warning(
+                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
+                )
+                task_obj.set_config(key="num_fewshot", value=num_fewshot)
+        else:
+            # if num_fewshot not provided, and the task does not define a default one, default to 0
+            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
+                task_obj.set_config(key="num_fewshot", value=0)
+
+    if check_integrity:
+        run_task_tests(task_list=tasks)
+
+    results = evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        limit=limit,
+        cache_requests=cache_requests,
+        rewrite_requests_cache=rewrite_requests_cache,
+        bootstrap_iters=bootstrap_iters,
+        write_out=write_out,
+        log_samples=log_samples,
+        verbosity=verbosity,
+    )
+
+    if lm.rank == 0:
+        if isinstance(model, str):
+            model_name = model
+        elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
+            model_name = model.config._name_or_path
+        else:
+            model_name = type(model).__name__
+
+        # add info about the model and few shot config
+        results["config"] = {
+            "model": model_name,
+            "model_args": model_args,
+            "batch_size": batch_size,
+            "batch_sizes": (
+                list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
+            ),
+            "device": device,
+            "use_cache": use_cache,
+            "limit": limit,
+            "bootstrap_iters": bootstrap_iters,
+            "gen_kwargs": gen_kwargs,
+        }
+        results["git_hash"] = get_git_commit_hash()
+        results["date"] = start_date
+        add_env_info(results)  # additional environment info to results
+        return results
+    else:
+        return None
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
new file mode 100644
index 00000000000..351588eaeb5
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
@@ -0,0 +1,430 @@
+import argparse
+# AutoRound weight-only quantization example for Phi-3-vision.
+# Run with --quantize to calibrate on an image/question set and save the result.
+parser = argparse.ArgumentParser()  # options are registered under the __main__ guard below
+import torch
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # set before transformers is imported
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # workspace setting PyTorch documents for deterministic cuBLAS
+torch.use_deterministic_algorithms(True, warn_only=True)  # warn_only: non-deterministic ops warn instead of raising
+import copy
+from PIL import Image
+import json
+from torch.utils.data import Dataset, DataLoader
+import torch
+from typing import Dict, Optional, List, Union, Sequence
+import transformers
+from model.processing_phi3_v import Phi3VProcessor
+from dataclasses import dataclass, field
+from transformers import AutoModelForCausalLM, AutoConfig
+from transformers.trainer_pt_utils import LabelSmoother
+
+IGNORE_TOKEN_ID = LabelSmoother.ignore_index
+import subprocess
+LLaVA_IMAGE_TOKEN = "<image>"  # image marker replaced by llava_to_openai
+DEFAULT_IMAGE_TOKEN = "<|image_1|>"  # marker written in its place
+IMAGE_TOKEN_INDEX = -200
+IGNORE_INDEX = -100  # padding value used for labels by the collator
+from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
+                                                    to_device,
+                                                    to_dtype,
+                                                    get_layer_names_in_block,
+                                                    detect_device,
+                                                    run_fn_for_vlm_autoround
+                                                    )
+from neural_compressor.torch.quantization import (AutoRoundConfig,
+                                                    prepare,
+                                                    convert,
+                                                    load)
+@dataclass
+class DataArguments:
+    """Options describing the calibration data read by LazySupervisedDataset."""
+
+    # Path of the JSON record file; Optional because the default is None.
+    data_path: Optional[str] = field(default=None, metadata={"help": "Path to the training data."})
+    lazy_preprocess: bool = True
+    is_multimodal: bool = True
+    # Directory that image base names are joined onto when a record has an image.
+    image_folder: Optional[str] = None
+    max_seq_length: int = field(default=2048, metadata={"help": "Maximum sequence length."})
+    
+    
+def llava_to_openai(data):
+    """Convert LLaVA conversation turns into role/content chat messages.
+
+    Each turn's "from" value is renamed (human -> user, gpt -> assistant,
+    anything else kept) and LLaVA_IMAGE_TOKEN inside "value" is replaced by
+    DEFAULT_IMAGE_TOKEN. A new list of new dicts is returned.
+    """
+    roles = {"human": "user", "gpt": "assistant"}
+    return [
+        {"role": roles.get(turn["from"], turn["from"]),
+         "content": turn["value"].replace(LLaVA_IMAGE_TOKEN, DEFAULT_IMAGE_TOKEN)}
+        for turn in data]
+
+
+class LazySupervisedDataset(Dataset):
+    """Calibration dataset that tokenizes conversations (and loads images) on access."""
+
+    def __init__(
+        self,
+        data_path: Union[str, List[dict]],
+        processor: transformers.PreTrainedTokenizer,
+        data_args: DataArguments,
+        padding=True,
+    ):
+        """Read records from a JSON file when ``data_path`` is a str, else use it as the record list."""
+        super(LazySupervisedDataset, self).__init__()
+        if isinstance(data_path, str):
+            with open(data_path, "r") as f:  # close the handle instead of leaking it
+                list_data_dict = json.load(f)
+        else:
+            list_data_dict = data_path
+
+        print("Formatting inputs...Skip in lazy mode")
+        self.processor = processor
+        self.list_data_dict = list_data_dict
+        self.data_args = data_args
+        self.padding = padding
+        self.max_seq_length = data_args.max_seq_length
+
+    def __len__(self):
+        return len(self.list_data_dict)
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        """Build the processor output for record ``i``.
+
+        With ``padding`` enabled, element 0 along the first dim of each tensor is returned and
+        records without an image get all-zero ``pixel_values``/``image_sizes`` placeholders.
+        """
+        sources = self.list_data_dict[i]
+        if isinstance(i, int):
+            sources = [sources]
+        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
+        record = sources[0]
+        if "image" in record:
+            # Only the base name is kept; it is looked up inside data_args.image_folder.
+            image_path = os.path.join(self.data_args.image_folder, os.path.basename(record["image"]))
+            with Image.open(image_path) as img:  # release the file handle once decoded
+                image = [img.convert("RGB")]
+        else:
+            image = None
+
+        # llava_to_openai builds fresh dicts, so the stored record is never mutated.
+        messages = llava_to_openai(record["conversations"])
+        prompt = self.processor.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        data_dict = self.processor(prompt, image, return_tensors="pt")
+
+        if self.padding:
+            if 'pixel_values' not in data_dict:
+                data_dict['pixel_values'] = torch.zeros([1, 17, 3, 336, 336], dtype=torch.bfloat16)
+                data_dict['image_sizes'] = torch.zeros([1, 2], dtype=torch.int64)
+            data_dict = {key: data_dict[key][0] for key in
+                         ("input_ids", "attention_mask", "pixel_values", "image_sizes", "labels")}
+        return data_dict
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+    """Batch per-sample dicts produced by the dataset into padded tensors."""
+
+    tokenizer: transformers.PreTrainedTokenizer
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        """Pad ids/labels to the longest sample, clip to ``model_max_length``, stack image tensors.
+
+        Returns a dict with input_ids, labels, pixel_values, image_sizes and attention_mask.
+        """
+        pad_id = self.tokenizer.pad_token_id
+        max_len = self.tokenizer.model_max_length
+
+        def _pad(key, fill):
+            seqs = [instance[key] for instance in instances]
+            padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=fill)
+            return padded[:, :max_len]
+
+        input_ids = _pad("input_ids", pad_id)
+        labels = _pad("labels", IGNORE_INDEX)
+        pixel_values = torch.stack([instance["pixel_values"] for instance in instances], dim=0)
+        image_sizes = torch.stack([instance["image_sizes"] for instance in instances], dim=0)
+
+        # Every position holding the pad id is masked out, whether it came from
+        # padding here or was already present in the sample.
+        return dict(
+            input_ids=input_ids,
+            labels=labels,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            attention_mask=input_ids.ne(pad_id),
+        )
+
+
+def create_data_loader(dataset, batch_size=1, data_collator=None):
+    """Wrap ``dataset`` in an unshuffled DataLoader; only ``batch_size == 1`` is accepted."""
+    assert batch_size == 1, "batch_size must be 1"
+    return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator)
+
+
+if __name__ == '__main__':
+    # Register the command-line options on the module-level parser.
+    parser.add_argument(
+        "--model_name", default="microsoft/Phi-3-vision-128k-instruct"
+    )
+
+    parser.add_argument("--quantize", action="store_true")
+
+    parser.add_argument("--accuracy", action="store_true")
+
+    parser.add_argument("--bits", default=4, type=int,
+                        help="number of  bits")
+
+    parser.add_argument("--group_size", default=128, type=int,
+                        help="group size")
+
+    parser.add_argument("--train_bs", default=1, type=int,
+                        help="train batch size")
+
+    parser.add_argument("--eval_bs", default=4, type=int,
+                        help="eval batch size")
+
+    parser.add_argument("--device", default="auto", type=str,
+                        help="The device to be used for tuning. The default is set to auto/None, "
+                             "allowing for automatic detection. Currently, device settings support CPU, GPU, and HPU.")
+
+    parser.add_argument("--sym", action='store_true',
+                        help=" sym quantization")
+
+    parser.add_argument("--iters", default=200, type=int,
+                        help=" iters")
+
+    parser.add_argument("--lr", default=None, type=float,
+                        help="learning rate, if None, it will be set to 1.0/iters automatically")
+
+    parser.add_argument("--minmax_lr", default=None, type=float,
+                        help="minmax learning rate, if None, it will be set to the same value as lr")
+
+    parser.add_argument("--seed", default=42, type=int,
+                        help="seed")
+
+    parser.add_argument("--eval_fp16_baseline", action='store_true',
+                        help="whether to eval FP16 baseline")
+
+    parser.add_argument("--adam", action='store_true',
+                        help="adam")
+
+    parser.add_argument("--seqlen", default=2048, type=int,
+                        help="sequence length")
+
+    parser.add_argument("--gradient_accumulate_steps", default=1, type=int, help="gradient accumulate steps")
+
+    parser.add_argument("--nblocks", default=1, type=int, help="num of blocks to tune together")
+
+    parser.add_argument("--nsamples", default=512, type=int,
+                        help="number of samples")
+
+    parser.add_argument("--low_gpu_mem_usage", action='store_true',
+                        help="low_gpu_mem_usage is deprecated")
+
+    parser.add_argument("--export_format", default='auto_round:gptq', type=str,
+                        help="format used when exporting the quantized model, default to 'auto_round:gptq'. "
+                             "'fake' only performs fake quantization and is required when act_bits <= 8.")
+
+    parser.add_argument("--scale_dtype", default='fp16',
+                        help="which scale data type to use for quantization, 'fp16', 'fp32' or 'bf16'.")
+
+    parser.add_argument("--output_dir", default="./tmp_autoround", type=str,
+                        help="Where to store the final model.")
+
+    parser.add_argument("--disable_eval", action='store_true',
+                        help="Whether to disable lmeval evaluation.")
+
+    parser.add_argument("--disable_amp", action='store_true',
+                        help="disable amp")
+
+    parser.add_argument("--disable_minmax_tuning", action='store_true',
+                        help="whether to disable weight minmax tuning")
+
+    parser.add_argument("--disable_trust_remote_code", action='store_true',
+                        help="Whether to disable trust_remote_code")
+
+    parser.add_argument("--disable_quanted_input", action='store_true',
+                        help="whether to disuse the output of quantized block to tune the next block")
+
+    parser.add_argument("--quant_lm_head", action='store_true',
+                        help="quant_lm_head")
+
+    parser.add_argument("--model_dtype", default=None, type=str,
+                        help="force to convert the dtype, some backends supports fp16 dtype better")
+
+    parser.add_argument("--model_max_length", default=2048, type=int,
+                        help="model max length (not read by this script yet)")
+
+    parser.add_argument("--act_bits", default=32, type=int,
+                    help="activation bits")
+
+    parser.add_argument("--quant_vision", action='store_true',
+                        help="To determine whether the quantization should handle vision component.")
+
+    parser.add_argument("--enable_safe_serialization", action='store_true',
+                        help="To determine whether the save_pretrained process should use safe_serialization.")
+
+    # ========== Calibration Datasets =============
+    parser.add_argument("--image_folder", default="coco", type=str,
+                        help="Folder holding the calibration images referenced by --question_file.")
+
+    parser.add_argument("--question_file", default=None, type=str,
+                            help="JSON file with the calibration conversations; needed for --quantize.")
+
+    # ================= Evaluation Related =====================
+    parser.add_argument("--tasks", #wikitext
+                        default="lambada_openai,hellaswag,winogrande,piqa,mmlu,truthfulqa_mc1," \
+                                "truthfulqa_mc2,openbookqa,boolq,rte,arc_easy,arc_challenge",
+                        help="lm-eval tasks for lm_eval version 0.4")
+
+    parser.add_argument("--eval-dataset", type=str, default="textvqa_val")
+
+    args = parser.parse_args()
+
+    if args.quantize:
+        # ---- argument validation: fail with real exceptions (asserts vanish under -O) ----
+        if args.question_file is None:
+            raise ValueError("--question_file is required when --quantize is set")
+
+        if args.act_bits <= 8:
+            print(
+                "Warning, activation quantization is an experiment feature")
+
+        if args.act_bits <= 8 and args.export_format != "fake":
+            raise ValueError("only support fake mode for activation quantization currently")
+
+        if "marlin" in args.export_format and not args.sym:
+            raise ValueError("marlin backend only supports sym quantization, please set --sym")
+
+        # Drop trailing slashes once and keep using the cleaned name below.
+        model_name = args.model_name.rstrip("/")
+        print(model_name, flush=True)
+
+        device_str = detect_device(args.device)
+
+        # NOTE(review): this seed is fixed and independent of --seed, which is
+        # only forwarded to AutoRoundConfig further down.
+        torch.manual_seed(1234)
+        seqlen = args.seqlen
+
+        # Pick the load dtype: "auto" unless running on HPU or overridden by --model_dtype.
+        torch_dtype = "auto"
+        if "hpu" in device_str:
+            torch_dtype = torch.bfloat16
+        if args.model_dtype in ("float16", "fp16"):
+            torch_dtype = torch.float16
+        elif args.model_dtype in ("bfloat16", "bfp16"):
+            torch_dtype = torch.bfloat16
+
+        trust_remote_code = not args.disable_trust_remote_code
+        # Honour --disable_trust_remote_code for the config as well as the weights.
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+        config.use_cache = False
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            trust_remote_code=trust_remote_code,
+            config=config,
+        )
+        seqlen = args.seqlen
+        processor = Phi3VProcessor.from_pretrained(model_name)
+        tokenizer = processor.tokenizer
+
+        # Calibration data: the loader only accepts batch_size == 1.
+        data_args = DataArguments(
+            data_path=args.question_file, is_multimodal=True,
+            image_folder=args.image_folder, max_seq_length=seqlen,
+        )
+        dataset = LazySupervisedDataset(
+            data_path=args.question_file, processor=processor, data_args=data_args
+        )
+        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
+        dataloader = create_data_loader(dataset, batch_size=args.train_bs, data_collator=data_collator)
+
+        # Blocks selected for tuning (presumably vision blocks only with --quant_vision; confirm in helper).
+        quant_block_list = get_multimodal_block_names(model, args.quant_vision)
+        quant_config = AutoRoundConfig(
+            bits=args.bits, use_sym=args.sym, batch_size=args.train_bs, group_size=args.group_size,
+            seqlen=seqlen, nblocks=args.nblocks, iters=args.iters, lr=args.lr,
+            minmax_lr=args.minmax_lr, enable_quanted_input=not args.disable_quanted_input,
+            nsamples=args.nsamples, seed=args.seed, gradient_accumulate_steps=args.gradient_accumulate_steps,
+            scale_dtype=args.scale_dtype, enable_minmax_tuning=not args.disable_minmax_tuning,
+            act_bits=args.act_bits, quant_block_list=quant_block_list, export_format=args.export_format)
+
+        # Layers of every block that was listed with quant_vision=True but not selected are set to fp32.
+        every_block = {tuple(block) for block in get_multimodal_block_names(model, quant_vision=True)}
+        chosen_block = {tuple(block) for block in quant_block_list}
+        set_to_full_prec = get_layer_names_in_block(model, quant_block_list=list(every_block - chosen_block))
+        for name in set_to_full_prec:
+            quant_config.set_local(name, AutoRoundConfig(dtype="fp32"))
+
+        # skip special layers
+        quant_config.set_local("model.vision_embed_tokens.img_projection*", AutoRoundConfig(dtype="fp32"))
+
+        for n, m in model.named_modules():
+            if not isinstance(m, (torch.nn.Linear, transformers.modeling_utils.Conv1D)):
+                continue
+            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
+                quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
+                print(
+                    f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
+
+        lm_head_layer_name = "lm_head"
+        if args.quant_lm_head:
+            # Quantizing lm_head is not supported when its weight is tied to the embeddings.
+            from transformers import AutoConfig
+            config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
+            tied_keys = getattr(model, "_tied_weights_keys", None) or []
+            if config.tie_word_embeddings and any(lm_head_layer_name in item for item in tied_keys):
+                ##TODO extend to encoder-decoder layer, seq classification model
+                args.quant_lm_head = False
+                print(
+                    f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
+                    f"supported currently")
+
+        if args.quant_lm_head:
+            # The version requirement only concerns runs that really quantize lm_head.
+            transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
+            if transformers_version[0] == 4 and transformers_version[1] < 38:
+                error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
+                raise EnvironmentError(error_message)
+        else:
+            quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
+        
+        # prepare -> run the calibration loader through the model -> convert
+        user_model = prepare(model=model, quant_config=quant_config)
+        run_fn_for_vlm_autoround(user_model, dataloader, seqlen, args.nsamples)
+        user_model = convert(user_model)
+
+        from neural_compressor.torch.utils import (LoadFormat,)
+        user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE,
+                        safe_serialization=args.enable_safe_serialization)  # flag was parsed but ignored before
+        if tokenizer is not None:
+            tokenizer.save_pretrained(args.output_dir)
+
+    
+    # if args.accuracy:
+    #     from eval.evaluation import simple_evaluate
+    #     device_str = detect_device(args.device)
+    #     tasks = args.tasks
+    #     if isinstance(tasks, str):
+    #         tasks = tasks.split(',')
+    #     model_args = f"pretrained={args.model_name}"
+    #     model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}"
+    #     user_model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code)
+    #     if args.act_bits <= 8:
+    #         user_model = model.to(device_str)
+
+    #     res = simple_evaluate(model="hf", model_args=model_args,
+    #                         tasks=tasks,
+    #                         batch_size=args.eval_bs, user_model=user_model)
+    #     from lm_eval.utils import make_table
+    #     print(make_table(res))
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/__init__.py
new file mode 100644
index 00000000000..dbe21cebce8
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/__init__.py
@@ -0,0 +1,4 @@
+# Only when this file is executed directly: put "./" first on sys.path.
+if __name__ == "__main__":
+    import sys
+    sys.path.insert(0, './')
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/configuration_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/configuration_phi3_v.py
new file mode 100644
index 00000000000..573bb8d39fb
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/configuration_phi3_v.py
@@ -0,0 +1,217 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Phi-3-V model configuration"""
+
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "microsoft/Phi-3-vision-128k-instruct": "https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/resolve/main/config.json",
+}
+
+
+class Phi3VConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Phi3VModel`]. It is used to instantiate a Phi-3
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the
+    [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32064):
+            Vocabulary size of the Phi-3-V model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Phi3VModel`].
+        hidden_size (`int`, *optional*, defaults to 3072):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            Dropout probability for mlp outputs.
+        embd_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the embeddings.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio after computing the attention scores.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model was trained with. This is used to determine the size of the
+            original RoPE embeddings when using long scaling.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value used for the RMSNorm.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
+            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
+            the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
+            divided by the number of attention heads divided by 2.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 32000):
+            The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*, defaults to 32000):
+            The id of the padding token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If `None`, no sliding window is applied.
+        embd_layer (`str`, *optional*, defaults to `"default"`):
+            The embedding layer to use. Can be either `"default"` or `"image"`. "default" uses the standard embedding for text.
+
+    Example:
+
+    ```python
+    >>> from transformers import Phi3VModel, Phi3VConfig
+
+    >>> # Initializing a Phi-3-V style configuration
+    >>> configuration = Phi3VConfig.from_pretrained("microsoft/Phi-3-vision-128k-instruct")
+
+    >>> # Initializing a model from the configuration
+    >>> model = Phi3VModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "phi3_v"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32064,
+        hidden_size=3072,
+        intermediate_size=8192,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attention_dropout=0.0,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        original_max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        bos_token_id=1,
+        eos_token_id=32000,
+        pad_token_id=32000,
+        sliding_window=None,
+        embd_layer: str = "default",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # When no KV-head count is given, use one KV head per attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.sliding_window = sliding_window
+        self.embd_layer = embd_layer
+
+        # Token ids, tie_word_embeddings and extra kwargs are forwarded to PretrainedConfig.__init__.
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate `rope_scaling`; raises `ValueError` unless it is `None` or a 3-key dict as described on the class.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+            )
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_embedding_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_embedding_phi3_v.py
new file mode 100644
index 00000000000..c2994c6ca05
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_embedding_phi3_v.py
@@ -0,0 +1,313 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import torch
+import torch.nn as nn
+from transformers import CLIPVisionModel, PretrainedConfig
+from transformers import CLIPVisionConfig 
+from transformers.utils import logging
+from datetime import datetime 
+
+logger = logging.get_logger(__name__)
+
+CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(  # config Phi3ImageEmbedding uses for 'openai/clip-vit-large-patch14-336'
+  attention_dropout=0.0,
+  dropout=0.0,
+  hidden_act="quick_gelu",
+  hidden_size=1024,
+  image_size=336,
+  initializer_factor=1.0,
+  initializer_range=0.02,
+  intermediate_size=4096,
+  layer_norm_eps=1e-05,
+  num_attention_heads=16,
+  num_channels=3,
+  num_hidden_layers=24,
+  patch_size=14,
+  projection_dim=768
+)
+
+class Phi3ImageEmbedding(nn.Module):
+    """Phi3 image embedding: embeds tokens with `wte` and overwrites image-placeholder positions with projected CLIP features."""
+
+    def __init__(self, config: PretrainedConfig, wte=None, **kwargs) -> None:
+        super().__init__()
+
+        # n_embed or hidden_size
+        hidden_size = config.n_embd if hasattr(config, 'n_embd') else config.hidden_size
+        if hasattr(config, 'embd_pdrop') or hasattr(config, 'embed_pdrop'):
+            embd_drop = config.embd_pdrop if hasattr(config, 'embd_pdrop') else config.embed_pdrop
+            self.drop = nn.Dropout(embd_drop)
+        else:
+            self.drop = None
+
+        self.wte = wte
+
+        if isinstance(config.img_processor, dict) and config.img_processor.get('name', None) == 'clip_vision_model':
+            assert 'model_name' in config.img_processor, 'model_name must be provided for CLIPVisionModel'
+            assert 'image_dim_out' in config.img_processor, 'image_dim_out must be provided for CLIPVisionModel'
+            assert 'num_img_tokens' in config.img_processor, 'num_img_tokens must be provided for CLIPVisionModel'
+            assert config.img_processor['model_name'] == 'openai/clip-vit-large-patch14-336'
+            clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
+            self.img_processor = CLIPVisionModel(clip_config)  # built from the config object; no from_pretrained call here
+            image_dim_out = config.img_processor['image_dim_out']
+            self.num_img_tokens = config.img_processor['num_img_tokens']
+        else:
+            raise NotImplementedError(f'img_processor = {config.img_processor}, not implemented')
+
+        self.image_dim_out = image_dim_out
+        self.img_sizes = None
+
+        # global_gn and sub_gn for hd transform, serves as line separator
+        self.use_hd_transform = kwargs.get('use_hd_transform', False)
+        self.with_learnable_separator = kwargs.get('with_learnable_separator', False)
+        self.hd_transform_order = kwargs.get('hd_transform_order', 'glb_sub')
+        # with_hd_transform and with_learnable_separator should have same value
+        assert self.use_hd_transform == self.with_learnable_separator, 'use_hd_transform and with_learnable_separator should have same value'
+        if self.with_learnable_separator:
+            assert self.use_hd_transform, 'learnable separator is only for hd transform'
+            # 1024 * 4, merge spatial to channel dimension
+            self.glb_GN = nn.Parameter(torch.zeros([1, 1, self.image_dim_out * 4]))
+            self.sub_GN = nn.Parameter(torch.zeros([1, 1, 1, self.image_dim_out * 4]))
+            logger.info(f'learnable separator enabled for hd transform, hd_transform_order = {self.hd_transform_order}')
+
+        projection_cls = kwargs.get('projection_cls', 'linear')
+        if projection_cls == 'linear':
+            self.img_projection = nn.Linear(image_dim_out, hidden_size)
+        elif projection_cls == 'mlp' and self.use_hd_transform:
+            dim_projection = hidden_size
+            depth = 2
+            layers = [nn.Linear(image_dim_out * 4, dim_projection)]  # input is 4x wider: forward() merges 2x2 patches into channels
+            for _ in range(1, depth):
+                layers.extend([nn.GELU(),
+                                nn.Linear(dim_projection, dim_projection)])
+            self.img_projection = nn.Sequential(*layers)
+        elif projection_cls == 'mlp':
+            dim_projection = hidden_size
+            depth = 2
+            layers = [nn.Linear(image_dim_out, dim_projection)]
+            for _ in range(1, depth):
+                layers.extend([nn.GELU(),
+                                nn.Linear(dim_projection, dim_projection)])
+            self.img_projection = nn.Sequential(*layers)
+        else:
+            raise NotImplementedError(f'projection_cls = {projection_cls}, not implemented')
+
+        self.vocab_size = config.vocab_size
+        self.img_features = None
+
+        if isinstance(config.img_processor, dict):
+            self.layer_idx = config.img_processor.get('layer_idx', -2)
+            self.type_feature = config.img_processor.get('type_feature', 'patch')
+        else:
+            self.layer_idx = -2
+            self.type_feature = 'patch'
+
+
+    def set_img_features(self, img_features: torch.FloatTensor) -> None:
+        self.img_features = img_features  # used in place of pixel_values by the next forward() call, then cleared
+
+    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
+        self.img_sizes = img_sizes  # overrides image_sizes in forward(); forward() does not clear it
+
+    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
+        """Run the vision model and return hidden_states[layer_idx]; 'patch' drops the first token, 'cls_patch' keeps it."""
+        LAYER_IDX = self.layer_idx
+        TYPE_FEATURE = self.type_feature
+        img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)
+        img_feature = img_processor_output.hidden_states[LAYER_IDX]
+
+        if TYPE_FEATURE == "patch":
+            patch_feature = img_feature[:, 1:]
+            return patch_feature
+
+        if TYPE_FEATURE == "cls_patch":
+            return img_feature
+
+        raise NotImplementedError
+
+    def forward(self, input_ids: torch.LongTensor, pixel_values: torch.FloatTensor, image_sizes=None) -> torch.FloatTensor:
+        """Embed input_ids with wte and overwrite image-placeholder (negative id) positions with projected image features."""
+        MAX_INPUT_ID = int(1e9)
+        img_embeds = pixel_values
+        img_sizes = image_sizes
+
+        if self.img_features is not None:
+            img_embeds = self.img_features.clone()
+            self.img_features = None
+
+        if self.img_sizes is not None:
+            img_sizes = self.img_sizes
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+
+        with torch.no_grad():
+            positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=False)
+        # Pre-bind names that are read after the branches below; the non-HD image paths never assign them (was a NameError).
+        select = False
+        hd_transform, num_pure_text = False, 0
+        if isinstance(self.img_projection, nn.Sequential):
+            target_device = self.img_projection[0].bias.device
+            target_dtype = self.img_projection[0].bias.dtype
+        else:  # It's a single nn.Linear layer
+            target_device = self.img_projection.bias.device
+            target_dtype = self.img_projection.bias.dtype
+
+        if positions.shape[0] > 0:
+            with torch.no_grad():
+                g_values = abs(input_ids[positions[:, 0], positions[:, 1]])
+
+            if self.use_hd_transform and img_sizes is not None and len(img_sizes):
+                hd_transform = True
+                assert img_embeds.ndim == 5, f'img_embeds size: {img_embeds.size()}, expect 5D tensor for hd transform'
+                # img_embeds: (num_images, max_num_crops, 3, H, W)
+                # img_sizes: (num_images, 2).view(1, -1)
+
+                start_time = datetime.now()
+                bs = img_embeds.shape[0]
+                # Nx(HW)xC
+                img_features = self.get_img_features(img_embeds.flatten(0, 1))
+                base_feat_height = base_feat_width = int(img_features.shape[1] ** 0.5)
+
+                assert base_feat_height == 24 and base_feat_width == 24, f'base_feat_height: {base_feat_height}, base_feat_width: {base_feat_width}, expect 24x24 features for hd transform'
+
+                # bs x max_num_crops x (24x24) x C
+                img_features = img_features.view(bs, -1, base_feat_height * base_feat_width, self.image_dim_out)
+                C = self.image_dim_out
+                H = base_feat_height
+
+                output_imgs = []
+                output_len = []
+                # training is tensor, inference is list
+                if isinstance(img_sizes, torch.Tensor):
+                    img_sizes = img_sizes.view(-1, 2)
+                num_pure_text = 0
+                for _bs in range(bs):
+                    h, w = img_sizes[_bs]
+                    h = h // 336
+                    w = w // 336
+                    B_ = h * w
+                    if B_ == 0:
+                        num_pure_text += 1
+                        continue
+
+                    # 1 x (24x24) x 1024
+                    global_img_feature = img_features[_bs, :1]
+
+                    # 1 x 12 x 12 x 4096
+                    glb_img = global_img_feature.reshape(1,H,H,C).reshape(1,H//2,2,H//2,2,C).contiguous().permute(0,1,3,2,4,5).reshape(1,H//2,H//2,4*C).contiguous()
+                    temp_glb_GN = self.sub_GN.repeat(1, H//2, 1, 1)
+
+                    # 1 x 156 x 4096
+                    glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(1,-1,4*C)
+
+                    # (max_num_crops-1) x (12x12) x C
+                    sub_img = img_features[_bs, 1:]
+                    # 16x574x1024
+                    # get rid of padding sub_img
+                    sub_img = sub_img[:B_]
+
+                    # (num_crops, 12, 2, 12, 2, 1024) -> (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
+                    sub_img = sub_img.reshape(B_,H,H,C).reshape(B_,H//2,2,H//2,2,C).contiguous().permute(0,1,3,2,4,5).reshape(B_,-1,4*C).contiguous()
+                    sub_img = sub_img.reshape(1, h, w, 12, 12, -1).permute(0,1,3,2,4,5).reshape(1,h*12,w*12,4*C)
+                    temp_sub_GN = self.sub_GN.repeat(1, h*12, 1, 1)
+                    sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(1,-1,4*C)
+                    # (1, num_img_tokens, 1024*4)
+
+                    # glb + sub
+                    if self.hd_transform_order == 'glb_sub':
+                        output_imgs.append(torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
+                    elif self.hd_transform_order == 'sub_glb':
+                        output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
+                    else:
+                        raise NotImplementedError(f'hd_transform_order = {self.hd_transform_order}, not implemented')
+
+                    temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
+                    assert temp_len == output_imgs[-1].shape[1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}'
+                    output_len.append(temp_len)
+
+                num_img_tokens = output_len
+                img_set_tensor = []
+                for _output_img in output_imgs:
+                    img_feature_proj = self.img_projection(_output_img.to(target_device).to(target_dtype))
+                    img_set_tensor.append(img_feature_proj)
+                # logger.info(f'img_embeds size: {img_embeds.size()}, image sizes: {img_sizes} loading time {datetime.now() - start_time}')
+            elif img_embeds.ndim == 4:
+                selected_g_values = g_values[::self.num_img_tokens]
+                assert len(img_embeds) == len(selected_g_values), f'img_embeds size: {img_embeds.size()}, selected_g_values size: {len(selected_g_values)}, selected_g_value {selected_g_values}'
+                start_time = datetime.now()
+                tt = (
+                    self.get_img_features(img_embeds)
+                    .to(target_device)
+                    .to(target_dtype)
+                    .reshape(-1, self.image_dim_out)
+                )
+                # logger.info(f'img_embeds size: {img_embeds.size()}, loading time {datetime.now() - start_time}')
+                img_set_tensor = self.img_projection(tt)  # adapted visual features.
+            elif img_embeds.ndim == 3:
+                selected_g_values = g_values[::self.num_img_tokens]
+                assert len(img_embeds) == len(selected_g_values), f'img_embeds size: {img_embeds.size()}, selected_g_values size: {len(selected_g_values)}, selected_g_value {selected_g_values}'
+                tt = (
+                    img_embeds
+                    .to(target_device)
+                    .to(target_dtype)
+                    .view(-1, self.image_dim_out)
+                )
+                img_set_tensor = self.img_projection(tt)  # adapted visual features.
+            else:
+                raise NotImplementedError
+            select = True
+        # Hack to work around the hang when deepspeed `zero3` is used and the training batch mixes pure-text
+        # and vision-language data: pure-text samples still run the vision model and the projection.
+        else:
+            num_pure_text = input_ids.shape[0]
+            self.get_img_features(img_embeds.flatten(0, 1))
+        for _ in range(num_pure_text):
+            self.img_projection(torch.zeros(1, 1921, 4096, device=self.img_processor.device, dtype=self.img_processor.dtype))
+        with torch.no_grad():
+            input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
+
+        hidden_states = self.wte(input_ids)
+
+        if select:
+            if hd_transform:
+                idx = 0
+                for i, cnt in enumerate(num_img_tokens):
+                    # see https://github.com/GaiZhenbiao/Phi3V-Finetuning/pull/5
+                    hidden_states = hidden_states.clone()
+                    hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = (
+                        img_set_tensor[i]
+                        .to(hidden_states.dtype)
+                        .to(hidden_states.device)
+                    )
+                    idx += cnt
+            else:
+                idx = 0
+                assert len(selected_g_values) * self.num_img_tokens == len(img_set_tensor), f'len(selected_g_values) * self.num_img_tokens = {len(selected_g_values) * self.num_img_tokens}, len(img_set_tensor) = {len(img_set_tensor)}'
+                for i, g in enumerate(selected_g_values):
+                    cnt = self.num_img_tokens
+                    hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = (
+                        img_set_tensor[i * cnt : (i + 1) * cnt]
+                        .to(hidden_states.dtype)
+                        .to(hidden_states.device)
+                        )
+                    idx += cnt
+
+        if self.drop is not None:
+            hidden_states = self.drop(hidden_states)
+
+        return hidden_states
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_processing_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_processing_phi3_v.py
new file mode 100644
index 00000000000..821096527f7
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/image_processing_phi3_v.py
@@ -0,0 +1,274 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Image processor class for Phi3-V."""
+
+from typing import List, Optional, Union
+
+import numpy as np
+from auto_round.utils import transformers
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers.image_transforms import (
+    convert_to_rgb,
+)
+from transformers.image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ImageInput,
+    make_list_of_images,
+    valid_images,
+)
+from transformers.utils import TensorType, is_vision_available, logging
+
+from transformers import AutoImageProcessor
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    from PIL import Image
+
+import torch
+import torchvision
+
+def padding_336(b):
+    """Pad image `b` with white (255) at top and bottom so its height becomes a multiple of 336; width is unchanged."""
+    width, height = b.size
+    tar = int(np.ceil(height / 336) * 336)
+    top_padding = int((tar - height)/2)
+    bottom_padding = tar - height - top_padding
+    left_padding = 0
+    right_padding = 0
+    b = torchvision.transforms.functional.pad(b, [left_padding, top_padding, right_padding, bottom_padding], fill=[255,255,255])
+    return b
+
+def calc_padded_size(width, height, padding_unit=336):  # size-only arithmetic; no image is touched
+    target_height = int(np.ceil(height / padding_unit) * padding_unit)  # height rounded up to a multiple of padding_unit
+    top_padding = int((target_height - height) / 2)
+    bottom_padding = target_height - height - top_padding
+    left_padding = 0
+    right_padding = 0
+    padded_width = width + left_padding + right_padding  # width is never padded
+    padded_height = height + top_padding + bottom_padding  # equals target_height
+    return padded_width, padded_height
+
+def HD_transform(img, hd_num=16):
+    """Resize `img` to width `scale * 336` keeping its aspect ratio, then pad the height to a multiple of 336 (padding_336).
+    `scale` is the largest integer with scale * ceil(scale / ratio) <= hd_num; taller-than-wide images are transposed first and back."""
+    width, height = img.size
+    trans = False
+    if width < height:
+        img = img.transpose(Image.TRANSPOSE)
+        trans = True
+        width, height = img.size
+    ratio = (width/ height)
+    scale = 1
+    while scale*np.ceil(scale/ratio) <= hd_num:
+        scale += 1
+    scale -= 1
+    new_w = int(scale * 336)
+    new_h = int(new_w / ratio)
+    img = torchvision.transforms.functional.resize(img, [new_h, new_w],)
+    img = padding_336(img)
+    width, height = img.size  # NOTE(review): these values are not read again in this function
+    if trans:
+        img = img.transpose(Image.TRANSPOSE)
+    return img
+
+def calc_hd_transform_size(width, height, hd_num=16):
+    """Return (width, height) after the same scale/resize/pad arithmetic HD_transform uses, computed from sizes only."""
+    transposed = False
+    if width < height:
+        width, height = height, width
+        transposed = True
+
+    ratio = width / height
+    scale = 1
+    while scale * np.ceil(scale / ratio) <= hd_num:
+        scale += 1
+    scale -= 1
+
+    new_width = int(scale * 336)
+    new_height = int(new_width / ratio)
+    # only the height is padded, up to the next multiple of calc_padded_size's default unit (336)
+    padded_width, padded_height = calc_padded_size(new_width, new_height)
+
+    if transposed:
+        padded_width, padded_height = padded_height, padded_width
+    return padded_width, padded_height
+
+def pad_to_max_num_crops_tensor(images, max_crops=5):
+    """
+    Zero-pad a stack of crops along dim 0 up to `max_crops`. images: B x C x H x W, B<=max_crops
+    """
+    B, C, H, W = images.shape
+    if B < max_crops:
+        pad = torch.zeros(max_crops - B, C, H, W, dtype=images.dtype, device=images.device)  # match the input's channel count
+        images = torch.cat([images, pad], dim=0)
+    return images
+
+
+class Phi3VImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Phi3 image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques
+    for processing high resolution images as explained in the [InternLM-XComposer2-4KHD](https://arxiv.org/abs/2401.16420)
+
+    Args:
+        num_crops (`int`, *optional*, defaults to 1): Passed as `hd_num` to `HD_transform`; outputs are padded to `num_crops + 1` crops.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        num_crops: int = 1,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.num_crops = num_crops
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+
+    def calc_num_image_tokens(
+            self,
+            images: ImageInput
+    ):
+        """Calculate the number of image tokens for each image.
+        Args:
+            images (`ImageInput`): Image or list of images; `.convert('RGB')` is called on each before `HD_transform`.
+        Returns:
+            A list with one token count per image, computed with the same formula `preprocess` uses.
+        """
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        images = [image.convert('RGB') for image in images]
+        # shapes holds [height, width] per transformed image (`.size` is indexed as (width, height))
+        elems = [HD_transform(im, hd_num = self.num_crops) for im in images]
+        shapes = [[im.size[1], im.size[0]] for im in elems]
+        num_img_tokens = [int((h//336*w//336+1)*144 + 1 + (h//336+1)*12) for h, w in shapes]
+        return num_img_tokens
+
+    def calc_num_image_tokens_from_image_size(self, width, height):
+        """
+        Calculate the number of image tokens for a given image size.
+        Args:
+            width (`int`): Width of the image.
+            height (`int`): Height of the image.
+        """
+        new_width, new_height = calc_hd_transform_size(width, height, hd_num=self.num_crops)
+        num_img_tokens = int((new_height // 336 * new_width // 336 + 1) * 144 + 1 + (new_height // 336 + 1) * 12)
+        return num_img_tokens
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ):
+        """
+        Args:
+            images (`ImageInput`):
+                Image or list of images to preprocess; `.convert('RGB')` is called on each one.
+                Every image is HD-transformed, converted to a tensor and normalized; there are no switches for these steps.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean passed to `torchvision.transforms.Normalize`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation passed to
+                `torchvision.transforms.Normalize`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to call `convert_to_rgb` first; `.convert('RGB')` is applied afterwards regardless.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: no conversion is requested; values are handed to `BatchFeature` as built here.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+        """
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # per-image pipeline applied after HD_transform: tensor conversion, then mean/std normalization
+        img_processor = torchvision.transforms.Compose([
+            torchvision.transforms.ToTensor(),
+            torchvision.transforms.Normalize(image_mean, image_std)
+        ])
+
+        # PIL images
+        # HD_transform pads images to a size that is a multiple of 336
+        # convert to RGB first
+        images = [image.convert('RGB') for image in images]
+        elems = [HD_transform(im, hd_num = self.num_crops) for im in images]
+        # tensor transform and normalize
+        hd_images = [img_processor(im) for im in elems]
+        # create global image
+        global_image = [torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(336, 336), mode='bicubic',).to(im.dtype) for im in hd_images]
+
+        # [(3, h, w)], where h, w is multiple of 336
+        shapes = [[im.size(1), im.size(2)] for im in hd_images]
+        num_img_tokens = [int((h//336*w//336+1)*144 + 1 + (h//336+1)*12) for h, w in shapes]
+        # reshape to channel dimension -> (num_images, num_crops, 3, 336, 336)
+        # (1, 3, h//336, 336, w//336, 336) -> (1, h//336, w//336, 3, 336, 336) -> (h//336*w//336, 3, 336, 336)
+        hd_images_reshape = [im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336).contiguous() for im, (h, w) in zip(hd_images, shapes)]
+        # concat global image and local image
+        hd_images_reshape = [torch.cat([_global_image] + [_im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)]
+
+        # pad to max_num_crops
+        image_transformed = [pad_to_max_num_crops_tensor(im, self.num_crops+1) for im in hd_images_reshape]
+        image_transformed = torch.stack(image_transformed, dim=0)
+        # image_sizes is the plain list of [h, w] pairs; any tensor conversion is left to BatchFeature
+        padded_images = image_transformed
+        image_sizes = shapes
+
+        data = {"pixel_values": padded_images,
+                "image_sizes": image_sizes,
+                "num_img_tokens": num_img_tokens
+                }
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+AutoImageProcessor.register("Phi3VImageProcessor", Phi3VImageProcessor)  # import-time side effect: registers the class under this string key
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/modeling_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/modeling_phi3_v.py
new file mode 100644
index 00000000000..0b5357b1655
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/modeling_phi3_v.py
@@ -0,0 +1,1634 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" PyTorch Phi-3-V model."""
+
+import inspect
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from .image_embedding_phi3_v import Phi3ImageEmbedding
+import transformers
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_phi3_v import Phi3VConfig
+
+
+
+logger = logging.get_logger(__name__)
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+    # Only defined when flash-attn is available; True if flash_attn_func accepts `window_size` (sliding-window support).
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+
+_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-vision-128k-instruct"
+_CONFIG_FOR_DOC = "Phi3VConfig"
+
+PHI3V_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "microsoft/Phi-3-vision-128k-instruct",
+    # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3
+]
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
+class Phi3RMSNorm(nn.Module):
+    """Root-mean-square layer norm with a learned per-channel gain (no mean-centering, no bias)."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Normalize over the last dim in float32, cast back to the input dtype, then apply the gain."""
+        orig_dtype = hidden_states.dtype
+        x = hidden_states.to(torch.float32)
+        mean_sq = x.pow(2).mean(-1, keepdim=True)
+        x = x * torch.rsqrt(mean_sq + self.variance_epsilon)
+        return self.weight * x.to(orig_dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    """Summarize a padding mask for the unpadded (variable-length) attention path.
+
+    Returns the flat indices of the non-zero mask entries, the int32 cumulative per-row counts with a
+    leading 0, and the largest per-row count as a Python number.
+    """
+    lengths = attention_mask.sum(dim=-1, dtype=torch.int32)
+    kept = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    offsets = F.pad(torch.cumsum(lengths, dim=0, dtype=torch.int32), (1, 0))
+    return kept, offsets, lengths.max().item()
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
+class Phi3RotaryEmbedding(nn.Module):
+    """Rotary position embedding that builds its inverse-frequency table lazily on the first forward call."""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim, self.max_position_embeddings, self.base = dim, max_position_embeddings, base
+        # Non-persistent buffer: starts empty and is filled in by the first forward().
+        self.register_buffer("inv_freq", None, persistent=False)
+
+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
+        """Return (cos, sin), each [bs, seq_len, dim] in x's dtype; x is used only for its device and dtype."""
+        if self.inv_freq is None:
+            exponents = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
+            self.inv_freq = 1.0 / (self.base**exponents)
+        inv_freq = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        positions = position_ids[:, None, :].float()
+
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        dev = x.device.type
+        if not isinstance(dev, str) or dev == "mps":
+            dev = "cpu"
+        with torch.autocast(device_type=dev, enabled=False):
+            angles = (inv_freq.float() @ positions.float()).transpose(1, 2)
+            table = torch.cat((angles, angles), dim=-1)
+            cos, sin = table.cos(), table.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
+    """Rotary embedding whose inverse frequencies are divided by the "su" short/long factors from `rope_scaling`."""
+
+    def __init__(self, dim, config, device=None):
+        super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
+        rope_scaling = config.rope_scaling
+        self.short_factor = rope_scaling["short_factor"]
+        self.long_factor = rope_scaling["long_factor"]
+        self.original_max_position_embeddings = config.original_max_position_embeddings
+
+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
+        """Return (cos, sin) in x's dtype; the `seq_len` argument is ignored and re-derived from `position_ids`."""
+        # The long factors take over once the largest position falls outside the original context window.
+        seq_len = torch.max(position_ids) + 1
+        factors = self.long_factor if seq_len > self.original_max_position_embeddings else self.short_factor
+        ext_factors = torch.tensor(factors, dtype=torch.float32, device=x.device)
+        # inv_freq is rebuilt on every call because the factor set depends on the current positions.
+        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
+        self.inv_freq = 1.0 / (ext_factors * self.base**exponents)
+        inv_freq = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        positions = position_ids[:, None, :].float()
+
+        # Extra cos/sin multiplier; stays 1.0 unless max_position_embeddings exceeds the original window.
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        scaling_factor = 1.0
+        if scale > 1.0:
+            scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
+
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        dev = x.device.type
+        if not isinstance(dev, str) or dev == "mps":
+            dev = "cpu"
+        with torch.autocast(device_type=dev, enabled=False):
+            angles = (inv_freq.float() @ positions.float()).transpose(1, 2)
+            table = torch.cat((angles, angles), dim=-1)
+            cos, sin = table.cos() * scaling_factor, table.sin() * scaling_factor
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
+    """Rotary embedding using the short/long factors from `rope_scaling` with the "yarn" cos/sin multiplier."""
+
+    def __init__(self, dim, config, device=None):
+        super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
+        rope_scaling = config.rope_scaling
+        self.short_factor = rope_scaling["short_factor"]
+        self.long_factor = rope_scaling["long_factor"]
+        self.original_max_position_embeddings = config.original_max_position_embeddings
+
+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
+        """Return (cos, sin) in x's dtype; the `seq_len` argument is ignored and re-derived from `position_ids`."""
+        # The long factors take over once the largest position falls outside the original context window.
+        seq_len = torch.max(position_ids) + 1
+        factors = self.long_factor if seq_len > self.original_max_position_embeddings else self.short_factor
+        ext_factors = torch.tensor(factors, dtype=torch.float32, device=x.device)
+        # inv_freq is rebuilt on every call because the factor set depends on the current positions.
+        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
+        self.inv_freq = 1.0 / (ext_factors * self.base**exponents)
+        inv_freq = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        positions = position_ids[:, None, :].float()
+
+        # Extra cos/sin multiplier; stays 1.0 unless max_position_embeddings exceeds the original window.
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        scaling_factor = 1.0
+        if scale > 1.0:
+            scaling_factor = 0.1 * math.log(scale) + 1.0
+
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        dev = x.device.type
+        if not isinstance(dev, str) or dev == "mps":
+            dev = "cpu"
+        with torch.autocast(device_type=dev, enabled=False):
+            angles = (inv_freq.float() @ positions.float()).transpose(1, 2)
+            table = torch.cat((angles, angles), dim=-1)
+            cos, sin = table.cos() * scaling_factor, table.sin() * scaling_factor
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Split the last dim into halves (a, b) and return the concatenation (-b, a)."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((back.neg(), front), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors by the given rotary position embedding.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis inserted into `cos` and `sin` so they broadcast against `q` and `k`. With `cos`/`sin` of shape
+            [batch_size, seq_len, head_dim], use 1 when `q`/`k` are [batch_size, heads, seq_len, head_dim] and 2
+            when they are [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
+    """
+    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(t):
+        # Same formula for queries and keys.
+        return (t * cos) + (rotate_half(t) * sin)
+
+    return _rotate(q), _rotate(k)
+
+
+class Phi3MLP(nn.Module):
+    """Gated feed-forward block whose gate and up projections share one fused linear layer."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        hidden, inner = config.hidden_size, config.intermediate_size
+        # The fused output is split in two in forward(): first half is the gate, second half the up projection.
+        self.gate_up_proj = nn.Linear(hidden, 2 * inner, bias=False)
+        self.down_proj = nn.Linear(inner, hidden, bias=False)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        """Compute down_proj(up * activation_fn(gate))."""
+        fused = self.gate_up_proj(hidden_states)
+        gate, up = fused.chunk(2, dim=-1)
+        return self.down_proj(up * self.activation_fn(gate))
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states  # nothing to repeat: hand back the very same tensor
+    tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+    return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+
+
+class Phi3Attention(nn.Module):
+    """Eager multi-head attention with a fused QKV projection, rotary position embeddings and an optional KV cache."""
+
+    def __init__(self, config: Phi3VConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.original_max_position_embeddings = config.original_max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.rope_scaling = config.rope_scaling
+        self.is_causal = True
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        # Fused projection: output features are laid out as [query | key | value] (see the slicing in forward).
+        op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
+        self._init_rope()
+
+    def _init_rope(self):  # sets self.rotary_emb: plain RoPE, or the "su"/"yarn" variant named by rope_scaling["type"]
+        if self.rope_scaling is None:
+            self.rotary_emb = Phi3RotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            if scaling_type == "su":
+                self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config)
+            elif scaling_type == "yarn":
+                self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config)
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.")
+
+        bsz, q_len, _ = hidden_states.size()
+        # Project once, then split the fused output into query / key / value along the last dimension.
+        qkv = self.qkv_proj(hidden_states)
+        query_pos = self.num_heads * self.head_dim
+        query_states = qkv[..., :query_pos]
+        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+        # (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # Total key length = the new tokens plus the usable length the cache reports for this layer.
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # Keys are cached after rotation; attention below uses whatever update() returns.
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        # The mask is added to the raw scores, so it must be additive with shape (bsz, 1, q_len, kv_seq_len).
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        # (bsz, heads, q_len, head_dim) -> (bsz, q_len, hidden_size)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class Phi3FlashAttention2(Phi3Attention):
+    """
+    Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
+        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which became the default in flash_attn>=2.1. This attribute records which behavior is in effect. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # This path cannot return attention weights: output_attentions is forced to False below.
+
+        if not _flash_supports_window_size:
+            logger.warning_once(
+                "The current flash attention version does not support sliding window attention. Please use `attn_implementation='eager'` or upgrade flash-attn library."
+            )
+            raise ValueError("The current flash attention version does not support sliding window attention.")
+
+        output_attentions = False
+
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+
+        bsz, q_len, _ = hidden_states.size()
+
+        qkv = self.qkv_proj(hidden_states)
+        query_pos = self.num_heads * self.head_dim
+        query_states = qkv[..., :query_pos]
+        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+
+        # Flash attention is called with (batch_size, seq_length, num_heads, head_dim). The states are first moved to
+        # (batch_size, num_heads, seq_length, head_dim) for the rotary embedding and the cache update, and are
+        # transposed back just before the flash-attention call below.
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        # Because the input can be padded, the absolute sequence length depends on the max position id.
+        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        use_sliding_windows = (
+            _flash_supports_window_size
+            and getattr(self.config, "sliding_window", None) is not None
+            and kv_seq_len > self.config.sliding_window
+        )
+
+        if past_key_value is not None:
+            # Slice the cache only if the config has a non-None `sliding_window` attribute
+            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
+            if (
+                getattr(self.config, "sliding_window", None) is not None
+                and kv_seq_len > self.config.sliding_window
+                and cache_has_contents
+            ):
+                slicing_tokens = 1 - self.config.sliding_window
+                # NOTE(review): the trimmed past_key / past_value are only shape-checked; nothing in this method writes them back to the cache - confirm this is intended.
+                past_key = past_key_value[self.layer_idx][0]
+                past_value = past_key_value[self.layer_idx][1]
+
+                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+                past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+
+                if past_key.shape[-2] != self.config.sliding_window - 1:
+                    raise ValueError(
+                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
+                        f" {past_key.shape}"
+                    )
+
+                if attention_mask is not None:
+                    attention_mask = attention_mask[:, slicing_tokens:]
+                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
+
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_dropout = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, the layer norms are usually cast to float32 for training stability reasons,
+        # so the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+        # to fp32.
+
+        if query_states.dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.qkv_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # Reshape to the expected shape for Flash Attention
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        attn_output = self._flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=attn_dropout,
+            use_sliding_windows=use_sliding_windows,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+        # output_attentions was forced to False above, so attn_weights is always None here.
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward
+    def _flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_length,
+        dropout=0.0,
+        softmax_scale=None,
+        use_sliding_windows=False,
+    ):
+        """
+        Run Flash Attention. If an `attention_mask` is given, the inputs are unpadded, passed to the variable-length
+        kernel and the output is padded back to `query_length`; otherwise the dense kernel is called directly.
+
+        Args:
+            query_states, key_states, value_states (`torch.Tensor`):
+                Input query, key and value states to be passed to the Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+                May be `None`, in which case nothing is unpadded.
+            query_length (`int`):
+                Length of the query sequence: the output is padded back to it and, with flash_attn<2.1, a length of 1
+                turns the causal flag off.
+            dropout (`float`):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+            use_sliding_windows (`bool`, *optional*):
+                Whether to activate sliding window attention.
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # A mask was supplied: unpad, run the variable-length kernel, then pad the output back
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            if not use_sliding_windows:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_in_batch_q,
+                    max_seqlen_k=max_seqlen_in_batch_k,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_in_batch_q,
+                    max_seqlen_k=max_seqlen_in_batch_k,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            if not use_sliding_windows:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+
+        return attn_output
+
+    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
+
+        # On the first iteration we need to properly re-create the padding mask
+        # by slicing it on the proper place
+        if kv_seq_len != attention_mask.shape[-1]:
+            attention_mask_num_tokens = attention_mask.shape[-1]
+            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
+
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+
+        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3
+# TODO @Arthur no longer copied from LLama after static cache
+class Phi3SdpaAttention(Phi3Attention):
+    """
+    Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `Phi3Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+
+    # Adapted from Phi3Attention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        qkv = self.qkv_proj(hidden_states)
+        query_pos = self.num_heads * self.head_dim
+        query_states = qkv[..., :query_pos]
+        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+PHI3_ATTENTION_CLASSES = {
+    "eager": Phi3Attention,
+    "flash_attention_2": Phi3FlashAttention2,
+    "sdpa": Phi3SdpaAttention,
+}
+
+
+class Phi3DecoderLayer(nn.Module):
+    def __init__(self, config: Phi3VConfig, layer_idx: int):
+        super().__init__()
+
+        self.config = config
+        self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
+
+        self.mlp = Phi3MLP(config)
+        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
+        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
+        self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
+                `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        attn_outputs, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+
+        hidden_states = residual + self.resid_attn_dropout(attn_outputs)
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + self.resid_mlp_dropout(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+PHI3V_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Phi3VConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare Phi-3-V model outputting raw hidden-states without any specific head on top.",
+    PHI3V_START_DOCSTRING,
+)
+class Phi3VPreTrainedModel(PreTrainedModel):
+    config_class = Phi3VConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Phi3DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = False
+    _supports_cache_class = True
+
+    _version = "0.0.5"
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+PHI3V_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images. Pixel values can be obtained using [`AutoImageProcessor`].
+            See [`Phi3ImageProcessor.__call__`] for details.
+        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`, *optional*):
+            The sizes of the images in the batch, being (height, width) for each image.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare Phi-3-V model outputting raw hidden-states without any specific head on top.",
+    PHI3V_START_DOCSTRING,
+)
+class Phi3VModel(Phi3VPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Phi3DecoderLayer`].
+    When `config.embd_layer` is a dict, a `Phi3ImageEmbedding` is also created to embed inputs that include images.
+
+    Args:
+        config: Phi3VConfig
+    """
+
+    def __init__(self, config: Phi3VConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.embed_dropout = nn.Dropout(config.embd_pdrop)  # NOTE(review): not applied in this class's `forward`
+
+        self.vision_embed_tokens = None
+        if isinstance(config.embd_layer, dict):
+            # vision embedding layer, configured from the `embd_layer` dict of the config
+            embedding_config = {
+                'embedding_cls': config.embd_layer['embedding_cls'],
+                **config.embd_layer
+            }
+            self.vision_embed_tokens = Phi3ImageEmbedding(config, wte=self.embed_tokens, **embedding_config)
+            # `wte=self.embed_tokens` above hands the text embedding table to the vision embedding module.
+
+        self.layers = nn.ModuleList(
+            [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._attn_implementation = config._attn_implementation
+        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(PHI3V_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # exactly one of input_ids / inputs_embeds must be given; batch size and sequence length are read from it
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape[:2]
+        elif inputs_embeds is not None:
+            batch_size, seq_length = inputs_embeds.shape[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        past_key_values_length = 0  # stays 0 unless `use_cache` is enabled below
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)  # also True when it is None
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)  # shape (1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            if pixel_values is not None and image_sizes is not None:  # both are needed for the vision path
+                assert self.vision_embed_tokens is not None, "Vision embedding layer is not defined"
+                inputs_embeds = self.vision_embed_tokens(input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
+            else:
+                inputs_embeds = self.embed_tokens(input_ids)
+
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size  # assumes a 0/1 mask
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to "
+                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                )
+
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers; it is replaced by None when it contains no 0
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            # 4d mask (built by `_prepare_4d_causal_attention_mask`) is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
+
+        hidden_states = inputs_embeds
+
+        # decoder layers; the accumulators stay None when the matching output flag is off
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    use_reentrant=False
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]  # cache follows the attn weights
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer (after the final norm)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = None
+        if use_cache:  # `use_legacy_cache` is only bound when this flag is set
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:  # the tuple form drops every entry that is None
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class Phi3VForCausalLM(Phi3VPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Phi3VModel(config)  # decoder stack; also builds the vision embedding when configured
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # hidden states -> vocab logits
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.model.embed_tokens  # token embedding table held by the wrapped decoder
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value  # replaces the token embedding table on the wrapped decoder
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.lm_head  # the vocabulary projection created in `__init__`
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings  # swaps the vocabulary projection used by `forward`
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
+    def set_decoder(self, decoder):
+        self.model = decoder  # replaces the wrapped decoder that `forward` calls
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
+    def get_decoder(self):
+        return self.model  # the wrapped decoder (a `Phi3VModel` unless replaced via `set_decoder`)
+
+    # Ignore copy
+    @add_start_docstrings_to_model_forward(PHI3V_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the next-token prediction loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example (NOTE: uses the text-only `Phi3ForCausalLM` class and passes no image inputs):
+
+        ```python
+        >>> from transformers import AutoTokenizer, Phi3ForCausalLM
+
+        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+
+        >>> prompt = "This is an example script ."
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]  # first entry holds the final (normed) hidden states
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()  # upcast before the loss is computed
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens to (num_tokens, vocab_size) logits and (num_tokens,) labels
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]  # logits take the place of the hidden states in the tuple
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, pixel_values=None, image_sizes=None, **kwargs
+    ):
+        """Assemble the keyword arguments for one `forward` call: trim `input_ids`/`attention_mask` against the
+        cache, derive `position_ids` from the mask when none is passed, and pass the image inputs through."""
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]  # non-`Cache` input: length is dim 2 of the first cached tensor
+                max_cache_length = None
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)  # masked positions get the placeholder position 1
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]  # keep positions only for the tokens still in input_ids
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "pixel_values": pixel_values,  # image inputs are forwarded unchanged on every call
+                "image_sizes": image_sizes,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
+    def _reorder_cache(past_key_values, beam_idx):
+        """Return the per-layer cache tuples with every state re-selected along dim 0 by `beam_idx`."""
+        # `beam_idx` is moved to each state's device before it is used for indexing.
+        return tuple(
+            tuple(state.index_select(0, beam_idx.to(state.device)) for state in layer_past)
+            for layer_past in past_key_values
+        )
+
+
+@add_start_docstrings(
+    """
+    The [`Phi3VModel`] with a sequence classification head on top (linear layer).
+
+    [`Phi3VForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    PHI3V_START_DOCSTRING,
+)
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
+class Phi3VForSequenceClassification(Phi3VPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = Phi3VModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)  # applied to every position; one position per row is pooled in forward
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(PHI3V_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        model_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = model_outputs[0]
+        logits = self.score(hidden_states)  # per-position logits; a single position per row is selected below
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1  # no pad token configured: pool the last position of the row
+        else:
+            if input_ids is not None:
+                # first pad position minus one; when a row has no pad token argmax is 0, so the modulo wraps -1 to the last index (no reverse indexing, for ONNX compatibility)
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
+            else:
+                sequence_lengths = -1  # only `inputs_embeds` given: padding cannot be located, pool the last position
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:  # inferred from num_labels / label dtype and stored on the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + model_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=model_outputs.past_key_values,
+            hidden_states=model_outputs.hidden_states,
+            attentions=model_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    [`Phi3VModel`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    PHI3V_START_DOCSTRING,
+)
+# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
+class Phi3VForTokenClassification(Phi3VPreTrainedModel):
+    def __init__(self, config: Phi3VConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.model = Phi3VModel(config)
+        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
+            classifier_dropout = config.classifier_dropout
+        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
+            classifier_dropout = config.hidden_dropout
+        else:
+            classifier_dropout = 0.1  # fallback when the config sets neither dropout attribute
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(PHI3V_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **deprecated_arguments,  # accepted but not read anywhere in this method
+    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Per-token labels for computing the token classification loss. Logits and labels are flattened to
+            `batch_size * sequence_length` rows and compared with Cross-Entropy, so indices are expected to be
+            class ids in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        model_outputs = self.model(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = model_outputs[0]
+        hidden_states = self.dropout(hidden_states)
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            batch_size, seq_length = labels.shape  # unpacking requires 2-D labels
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(
+                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
+            )
+
+        if not return_dict:
+            output = (logits,) + model_outputs[2:]  # NOTE(review): skips model_outputs[1], presumably past_key_values (no such field on TokenClassifierOutput) - confirm when use_cache is off
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=model_outputs.hidden_states,
+            attentions=model_outputs.attentions,
+        )
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/processing_phi3_v.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/processing_phi3_v.py
new file mode 100644
index 00000000000..1263845d1bc
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/model/processing_phi3_v.py
@@ -0,0 +1,296 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Modified by Github@GaiZhenbiao
+
+"""
+Processor class for Phi3-V.
+"""
+import re
+from typing import List, Optional, Union
+
+import torch
+from .image_processing_phi3_v import Phi3VImageProcessor
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
+from transformers.utils import TensorType
+import transformers
+transformers.Phi3VImageProcessor = Phi3VImageProcessor
+
+class Phi3VProcessor(ProcessorMixin):
+    r"""
+    Constructs a Phi3-V processor which wraps a Phi3-V image processor and a LLaMa tokenizer into a single processor.
+
+    [`Phi3VProcessor`] offers all the functionalities of [`Phi3VImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~Phi3VProcessor.__call__`] and [`~Phi3VProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`Phi3VImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "Phi3VImageProcessor"
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    special_image_token = "<|image|>"
+
+    def __init__(self, image_processor, tokenizer):
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        self.num_img_tokens = image_processor.num_img_tokens
+        self.img_tokens = [f"<|image_{i+1}|>" for i in range(1000000)]  # NOTE(review): eagerly builds 1,000,000 strings and is not read anywhere in this class - confirm external use
+
+    def __call__(
+        self,
+        text: Union[TextInput, List[TextInput]],
+        images: ImageInput = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        Phi3VImageProcessor's [`~Phi3VImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **image_sizes** -- Returned alongside **pixel_values** when `images` is not `None`.
+            - **labels** -- Token ids with every position outside an assistant turn set to `-100`.
+        """
+        if images is not None:
+            image_inputs = self.image_processor(images, return_tensors=return_tensors)
+        else:
+            image_inputs = {}
+        inputs = self._convert_images_texts_to_inputs(image_inputs, text, padding=padding, truncation=truncation, max_length=max_length, return_tensors=return_tensors)
+        return inputs
+
+    def calc_num_image_tokens(self, images: ImageInput):
+        """ Calculate the number of image tokens for each image.
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+        """
+        return self.image_processor.calc_num_image_tokens(images)
+
+    def calc_num_image_tokens_from_image_size(self, width, height):
+        """ Calculate the number of image tokens for an image with given width and height.
+        Args:
+            width (`int`):
+                Width of the image.
+            height (`int`):
+                Height of the image.
+        """
+        return self.image_processor.calc_num_image_tokens_from_image_size(width, height)
+
+    @property
+    def special_image_token_id(self):
+        """Id the tokenizer assigns to `special_image_token`."""
+        return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
+
+    def get_special_image_token_id(self):
+        """Method form of `special_image_token_id`; performs the same tokenizer lookup."""
+        return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
+
+    def _convert_images_texts_to_inputs(self, images, texts, padding=False, truncation=None, max_length=None, return_tensors=None):
+        """Turn one chat-formatted prompt (and optional image features) into model inputs with `labels`.
+
+        `texts` is split on the role markers and on `<|image_1|>`. Chunks belonging to an assistant turn
+        (its content and closing `<|end|>` marker) keep their token ids in `labels`; every other position
+        is `-100`. With images, each `<|image_1|>` chunk is replaced by a run of negative placeholder ids.
+        """
+
+        def split_with_separators(s, separators):
+            """Split `s` at each earliest-occurring separator, keeping the separators as their own parts."""
+            parts = []
+            start = 0
+            while start < len(s):
+                # `default` handles a tail with no separator left; without it `min()` raises ValueError on
+                # the empty sequence and the `-1` branch below could never run.
+                pos, sep = min(((s.find(c, start), c) for c in separators if s.find(c, start) != -1), default=(-1, None))
+                if pos == -1:
+                    parts.append(s[start:])
+                    break
+                if s[start:pos]:
+                    parts.append(s[start:pos])
+                parts.append(sep)
+                start = pos + len(sep)
+            return parts
+
+        def split_with_roles(input_text):
+            """Tag every part with the current role and a type (0 text/image, 1 user, 2 assistant, 3 end)."""
+            parts = split_with_separators(input_text, ["<|user|>\n", "<|end|>\n", "<|assistant|>\n", "<|image_1|>"])
+            new_parts = []
+            current_role = None
+            for p in parts:
+                if p in ["<|user|>\n", "<|assistant|>\n", "<|end|>\n"]:
+                    if p == "<|user|>\n":
+                        current_role = "user"
+                    elif p == "<|assistant|>\n":
+                        current_role = "assistant"
+                    _type = ["<|user|>\n", "<|assistant|>\n", "<|end|>\n"].index(p) + 1
+                    new_parts.append({"role": current_role, "content": p, "type": _type})
+                else:
+                    new_parts.append({"role": current_role, "content": p, "type": 0})
+            return new_parts
+
+        # Text-only prompt: the tokenizer call produces the inputs; only `labels` is assembled chunk by chunk.
+        if not len(images):
+            model_inputs = self.tokenizer(texts, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length)
+            label_prompt_chunks = []
+            # the behavior of the tokenizer is very very weird, what I observed is concluded by the following:
+            # 1. "<|user|>\n" is encoded as 3 tokens, while "<|assistant|>\n" is encoded as 1 tokens
+            # 2. tokenizing "I am here" and "\nI am here", the tokens of "I" in these two cases are different ("I" can be any word and is used as an example here)
+            # 3. when tokenizing "<|user|>\nI am here", the tokens of "I" follow the tokenization of "I" in "\nI am here"
+            # 4. when tokenizing "<|assistant|>\nI am here", the tokens of "I" follow the tokenization of "I" in "I am here"
+            # [Edited by zhenwei - 2024-06-01 22:25]
+            for chunk in split_with_roles(texts):
+                if chunk["role"] == "assistant" and chunk['type'] in [0, 3]:
+                    tmp_input_ids = self.tokenizer(chunk["content"], add_special_tokens=False).input_ids
+                    label_prompt_chunks.append(tmp_input_ids)
+                else:
+                    # tokenized behind a '\n' with the first two ids dropped, presumably to match observations 2-3 above
+                    tmp_input_ids = self.tokenizer('\n' + chunk["content"], add_special_tokens=False).input_ids[2:]
+                    label_prompt_chunks.append([-100] * len(tmp_input_ids))
+
+            # leading -100 presumably covers the BOS token the tokenizer prepends; the assert below checks the length
+            labels = [-100]
+            for chunk in label_prompt_chunks:
+                labels.extend(chunk)
+
+            labels = torch.tensor(labels, dtype=torch.long).unsqueeze(0)
+            assert labels.shape[1] == model_inputs['input_ids'].shape[1], f"labels length: {labels.shape[1]}, input_ids length: {model_inputs['input_ids'].shape[1]}"
+            return BatchFeature(data={**model_inputs, "labels": labels})
+
+        if 'num_img_tokens' in images:
+            num_img_tokens = images['num_img_tokens']
+        else:
+            assert 'num_crops' in images, 'num_crops must be provided in images if num_img_tokens is not provided'
+            num_crops = images['num_crops']
+            num_img_tokens = [_num_crops * self.num_img_tokens for _num_crops in num_crops]
+
+        images, image_sizes = images['pixel_values'], images['image_sizes']
+
+        pattern = r"<\|image_\d+\|>"
+        # image_tags needs to start from 1 to n
+        image_tags = re.findall(pattern, texts)
+        image_ids = [int(s.split("|")[1].split("_")[-1]) for s in image_tags]
+        unique_image_ids = sorted(set(image_ids))
+        # image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be [1, 4, 5]
+        # check the condition
+        assert unique_image_ids == list(range(1, len(unique_image_ids)+1)), f"image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be {unique_image_ids}"
+        # total images must be the same as the number of image tags
+        assert len(unique_image_ids) == len(images), f"total images must be the same as the number of image tags, got {len(unique_image_ids)} image tags and {len(images)} images"
+
+        # one run of `-image_id` per tag, as long as that image's token count; consumed in tag order below
+        image_ids_pad = [[-iid]*num_img_tokens[iid-1] for iid in image_ids]
+
+        prompt_chunks = []
+        label_prompt_chunks = []
+        for chunk in split_with_roles(texts):
+            if chunk["role"] == "assistant" and chunk['type'] in [0, 3]:
+                tmp_input_ids = self.tokenizer(chunk["content"], add_special_tokens=False).input_ids
+                prompt_chunks.append(tmp_input_ids)
+                label_prompt_chunks.append(tmp_input_ids)
+            else:
+                if chunk["content"] == "<|image_1|>":
+                    tmp_input_ids = image_ids_pad.pop(0)
+                else:
+                    tmp_input_ids = self.tokenizer('\n' + chunk["content"], add_special_tokens=False).input_ids[2:]
+                prompt_chunks.append(tmp_input_ids)
+                label_prompt_chunks.append([-100] * len(tmp_input_ids))
+
+        input_ids = [1]  # presumably the BOS token id - TODO confirm against the tokenizer
+        labels = [-100]
+        for chunk in prompt_chunks:
+            input_ids.extend(chunk)
+        for chunk in label_prompt_chunks:
+            labels.extend(chunk)
+        input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)
+        labels = torch.tensor(labels, dtype=torch.long).unsqueeze(0)
+        # image placeholders are small negative ids, so this threshold keeps them attended like ordinary tokens
+        attention_mask = (input_ids > -1000000).to(torch.long)
+
+        return BatchFeature(data={"input_ids": input_ids,
+                                  "attention_mask": attention_mask,
+                                  "pixel_values": images,
+                                  "image_sizes": image_sizes,
+                                  "labels": labels})
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt
new file mode 100644
index 00000000000..1322923bd00
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt
@@ -0,0 +1,18 @@
+transformers==4.41.0
+torch
+tiktoken
+transformers_stream_generator
+peft
+sentencepiece
+einops
+accelerate
+datasets
+protobuf
+auto-gptq
+openpyxl
+wandb
+py-cpuinfo
+Pillow
+torchvision
+lm-eval==0.4.4
+setuptools==70.0.0
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
new file mode 100644
index 00000000000..d01f166ff7f
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Launch main.py for microsoft/Phi-3-vision-128k-instruct on the GPU selected by `device`.
+# Replace the /PATH/TO/... placeholders with the local COCO train2017 images and llava_v1_5_mix665k.json.
+set -x
+device=0
+model_name=microsoft/Phi-3-vision-128k-instruct
+CUDA_VISIBLE_DEVICES=$device \
+python3 main.py \
+--model_name=$model_name \
+--nsamples 512 \
+--model_dtype fp32 \
+--image_folder /PATH/TO/coco/images/train2017 \
+--question_file /PATH/TO/llava_v1_5_mix665k.json \
+--output_dir "./tmp_autoround"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md
new file mode 100644
index 00000000000..9c9f729e3ee
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md
@@ -0,0 +1,178 @@
+Step-by-Step
+============
+This document describes the step-by-step instructions to run [VLM quantization for Qwen-VL](https://huggingface.co/Qwen/Qwen-VL) using AutoRound Quantization.
+
+# Run Quantization on Qwen-VL Models
+
+In this example, we introduce a straightforward way to run quantization on popular multimodal models such as Qwen-VL.
+
+## Download the calibration data
+
+Our calibration process resembles the official visual instruction tuning process.
+
+Please download the annotation file of the final LLaVA instruction-tuning data mixture, [llava_v1_5_mix665k.json](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json), and download the images from the constituent datasets:
+
+COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip the image folder to any directory you desire.
+
+You can also refer to the official Qwen-VL finetuning requirements to create a [custom dataset](https://github.com/QwenLM/Qwen-VL/blob/master/README.md#data-preparation)
+
+## Download the evaluation data
+
+Please refer to [Qwen-VL evaluation](https://github.com/cognitedata/Qwen-VL-finetune/blob/master/eval_mm/EVALUATION.md)
+<details>
+<summary>TextVQA Data Preparation</summary>
+
+```bash
+mkdir -p data/textvqa && cd data/textvqa
+
+# download images
+wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip && unzip train_val_images.zip
+
+# download annotations and questions
+wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_train.json
+wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json
+
+# download converted files
+wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_annotations.json
+wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_questions.json
+wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train.jsonl
+wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_annotations.json
+wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_questions.json
+wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val.jsonl
+
+cd ../..
+
+```
+</details>
+
+<br />
+
+<details>
+<summary>ScienceQA Data Preparation</summary>
+
+```bash
+mkdir -p data/scienceqa/images && cd data/scienceqa/images
+
+# download images
+wget https://scienceqa.s3.us-west-1.amazonaws.com/images/test.zip && unzip test.zip
+
+cd ..
+
+# download original questions
+wget https://github.com/lupantech/ScienceQA/blob/main/data/scienceqa/problems.json
+
+# download converted files
+wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/scienceqa/scienceqa_test_img.jsonl
+
+cd ../..
+
+```
+</details>
+<br />
+
+## 2. Run Examples
+Enter the example folder and install the requirements:
+```bash
+pip install -r requirements.txt
+```
+
+- **Default Settings:**
+```bash
+CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name Qwen/Qwen-VL  --bits 4 --group_size 128
+```
+
+
+## 3. Run Inference
+
+```python
+  from transformers import AutoModelForCausalLM, AutoTokenizer
+  from transformers.generation import GenerationConfig
+  import torch
+  from transformers import set_seed
+  set_seed(1234)
+  from auto_round.auto_quantizer import AutoHfQuantizer
+  quantized_model_path = "./tmp_autoround"
+  tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, trust_remote_code=True)
+  # use bf16
+  model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, bf16=True).eval()
+  # use fp16
+  # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, fp16=True).eval()
+  # use cpu only
+  # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu", trust_remote_code=True).eval()
+  # use cuda device
+  # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda", trust_remote_code=True).eval()
+  query = tokenizer.from_list_format([{'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, \
+      {'text': 'Generate the caption in English with grounding:'}, \
+  ])
+  inputs = tokenizer(query, return_tensors='pt')
+  inputs = inputs.to(model.device)
+  with torch.cuda.amp.autocast(): 
+      pred = model.generate(**inputs)
+  response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
+  print(response)
+  # <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(731,806)</box> and<ref> her dog</ref><box>(219,424),(576,896)</box> playing on the beach<|endoftext|>
+  image = tokenizer.draw_bbox_on_latest_picture(response)
+  if image:
+    image.save('2.jpg')
+  else:
+    print("no box")
+
+```
+
+
+## 4. Results
+The [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets are used for quantization calibration, and the TextVQA dataset is used for evaluation. Please follow the [recipe](./run_autoround.sh) and the [evaluation script](./run_eval.sh). The results for Qwen-VL are as follows:
+| Metric         | bf16   | INT4   |
+|:----------------|:--------|:--------|
+| avg            | 0.5628 | 0.5589 |
+| paper-avg      | 0.5603 | 0.5611 |
+| mmlu           | 0.4828 | 0.4639 |
+| lambada_openai | 0.6782 | 0.6664 |
+| hellaswag      | 0.5593 | 0.5487 |
+| winogrande     | 0.6827 | 0.6875 |
+| piqa           | 0.7786 | 0.7748 |
+| truthfulqa_mc1 | 0.2876 | 0.2901 |
+| openbookqa     | 0.2880 | 0.2940 |
+| boolq          | 0.7012 | 0.7318 |
+| arc_easy       | 0.7201 | 0.7327 |
+| arc_challenge  | 0.4249 | 0.4206 |
+| cmmlu          | 0.4798 | 0.4618 |
+| ceval          | 0.4814 | 0.4569 |
+| textVQA        | 0.6402 | 0.6379 |
+| scienceVQA     | 0.6748 | 0.6574 |
+
+## 5. Known Issues
+* 'QWenTokenizer' object has no attribute 'IMAGE_ST'
+
+    This error, raised during evaluation or inference with a quantized model, is caused by Qwen-VL being incompatible with newer versions of transformers. You can refer to this issue and, as a workaround, manually comment out lines 227-228 in the 'tokenization_qwen.py' file.
+
+
+* No such file or directory: 'PATH/modeling_qwen.py'
+
+    Due to the particularities of Qwen-VL, even when setting trust_remote_code=True while loading the model, the above error may still occur. Please manually copy the modeling_qwen.py, visual.py, and qwen_generation_utils.py files from the original model path to resolve the issue.
+
+
+## 6. Environment
+
+PyTorch 1.8 or higher is required.
+
+
+## Reference
+If you find SignRound useful for your research, please cite our paper:
+```bibtex
+@article{cheng2023optimize,
+  title={Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs},
+  author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao},
+  journal={arXiv preprint arXiv:2309.05516},
+  year={2023}
+}
+```
+
+
+
+
+
+
+
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py
new file mode 100644
index 00000000000..22b51252a56
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py
@@ -0,0 +1,526 @@
+import argparse
+# import sys
+# sys.path.insert(0, '../../..')
+parser = argparse.ArgumentParser()
+import torch
+import os
+import transformers
+# os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+# torch.use_deterministic_algorithms(True, warn_only=True)
+from transformers import set_seed
+import json
+from torch.utils.data import Dataset, DataLoader
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from auto_round.utils import convert_dtype_torch2str
+from typing import Dict, Optional, List
+from transformers.trainer_utils import RemoveColumnsCollator
+from transformers.data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+from PIL import Image
+from transformers.trainer_pt_utils import LabelSmoother
+IGNORE_TOKEN_ID = LabelSmoother.ignore_index
+import inspect
+OLD_IMAGE_TOKEN = '<image>'
+DEFAULT_IM_START_TOKEN = '<img>'
+DEFAULT_IM_END_TOKEN = '</img>'
+from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
+                                                    to_device,
+                                                    to_dtype,
+                                                    get_layer_names_in_block,
+                                                    detect_device,
+                                                    run_fn_for_vlm_autoround
+                                                    )
+from neural_compressor.torch.quantization import (AutoRoundConfig,
+                                                    prepare,
+                                                    convert,
+                                                    load)
+
+def DataFormating(raw_data, image_folder=None, model_type='qwen'):
+    for source in raw_data:
+        source_inputs = source['conversations']
+        for sentence in source_inputs:
+            sentence['from'] = sentence['from'].replace('human', 'user')
+            sentence['from'] = sentence['from'].replace('gpt', 'assistant')
+            if OLD_IMAGE_TOKEN in sentence['value']:
+                sentence['value'] = sentence['value'].replace(OLD_IMAGE_TOKEN, '').strip()
+                sentence['value'] = OLD_IMAGE_TOKEN + sentence['value']
+                sentence['value'] = sentence['value'].strip()
+                if 'qwen2' in model_type: # for Qwen2-vl
+                    replace_token = '<|vision_start|><|image_pad|><|vision_end|>'
+                else:
+                    replace_img = os.path.join(image_folder, os.path.basename(source["image"]))
+                    replace_token = DEFAULT_IM_START_TOKEN + replace_img + DEFAULT_IM_END_TOKEN + '\n'
+                sentence["value"] = sentence["value"].replace(OLD_IMAGE_TOKEN, replace_token)
+    return raw_data
+
+
+def qwen2_preprocess(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+    max_len: int,
+    system_message: str = "You are a helpful assistant."
+) -> Dict:
+    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
+    im_start = "<|im_start|>"
+    im_end = "<|im_end|>"
+    nl_tokens = '\n'
+    _system = 'system' + nl_tokens
+
+    # Apply prompt templates
+    inputs, targets = [], []
+    for i, source in enumerate(sources):
+        if roles[source[0]["from"]] != roles["user"]:
+            source = source[1:]
+
+        text, target = "", None
+        system = im_start + _system + system_message + im_end + nl_tokens
+        text += system
+        for j, sentence in enumerate(source):
+            role = roles[sentence["from"]]
+            _text = role + nl_tokens + \
+                sentence["value"] + im_end + nl_tokens
+            text += _text
+        token_length = len(tokenizer(text).input_ids)
+        if token_length < max_len:
+            text += tokenizer.pad_token * (max_len - token_length)
+        else:
+            text = tokenizer.decode(tokenizer(text).input_ids[:max_len])
+            pass
+        inputs.append(text)
+
+    return inputs
+
+
+def preprocess(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+    max_len: int,
+    system_message: str = "You are a helpful assistant."
+) -> Dict:
+    """Tokenize conversations into max_len-long input_ids, labels and attention_mask tensors."""
+    # NOTE(review): `model_type` is not a parameter; it is read as a module-level global.
+    # NOTE(review): the 'qwen2' branch stores tokenizer(...) results, not token ids -- confirm.
+    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
+    if 'qwen2' not in model_type:
+        im_start = tokenizer.im_start_id
+        im_end = tokenizer.im_end_id
+    else:
+        im_start = tokenizer('<|im_start|>')
+        im_end = tokenizer('<|im_end|>')
+    nl_tokens = tokenizer('\n').input_ids
+    _system = tokenizer('system').input_ids + nl_tokens
+    _user = tokenizer('user').input_ids + nl_tokens
+    _assistant = tokenizer('assistant').input_ids + nl_tokens
+    # Apply prompt templates: one system turn, then every turn of the conversation.
+    input_ids, targets = [], []
+    for i, source in enumerate(sources):
+        if roles[source[0]["from"]] != roles["user"]:
+            source = source[1:]  # drop a leading non-user turn
+        input_id, target = [], []
+        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
+        input_id += system
+        # System turn: only the delimiters stay as labels (the -3 presumably assumes a one-token newline).
+        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
+        assert len(input_id) == len(target)
+        for j, sentence in enumerate(source):
+            role = roles[sentence["from"]]
+            _input_id = tokenizer(role).input_ids + nl_tokens + \
+                tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
+            input_id += _input_id
+            if role == '<|im_start|>user':
+                # User turn: everything between the delimiters is ignored in the labels.
+                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
+            elif role == '<|im_start|>assistant':
+                # Assistant turn: the role header is ignored, the reply tokens are kept as labels.
+                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
+                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
+            else:
+                raise NotImplementedError
+            target += _target
+        assert len(input_id) == len(target)
+        input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))  # right-pad the ids
+        target += [IGNORE_TOKEN_ID] * (max_len - len(target))  # padding is ignored in the labels
+        input_ids.append(input_id[:max_len])  # truncate over-long samples
+        targets.append(target[:max_len])
+    input_ids = torch.tensor(input_ids, dtype=torch.int)
+    targets = torch.tensor(targets, dtype=torch.int)
+    return dict(
+        input_ids=input_ids,
+        labels=targets,
+        attention_mask=input_ids.ne(tokenizer.pad_token_id),  # True where the id differs from the pad id
+    )
+
+class LazySupervisedDataset(Dataset):
+    """Dataset for supervised fine-tuning; samples are built on first access and cached.
+
+    NOTE(review): `model_type` passed to __init__ is not stored; __getitem__ reads a module global.
+    """
+    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer,
+                 max_len: int, image_folder=None, model_type='qwen_vl'):
+        super(LazySupervisedDataset, self).__init__()
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+        self.image_folder = image_folder
+        print("Formatting inputs...Skip in lazy mode")
+        self.raw_data = raw_data
+        self.cached_data_dict = {}  # index -> already-built sample
+
+    def __len__(self):
+        return len(self.raw_data)
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        if i in self.cached_data_dict:
+            return self.cached_data_dict[i]
+        if 'qwen2' not in model_type:  # module-level global, not the constructor argument
+            ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
+            ret = dict(
+                input_ids=ret["input_ids"][0],
+                labels=ret["labels"][0],
+                attention_mask=ret["attention_mask"][0],
+            )
+        else:
+            texts = qwen2_preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
+            image_path = os.path.join(f"file://{self.image_folder}", os.path.basename(self.raw_data[i]["image"]))
+            image_inputs = fetch_image({'image':image_path})  # `fetch_image` must exist as a module global
+            ret = self.tokenizer.processor(  # expects a `processor` attribute attached to the tokenizer
+                text=texts,
+                images=image_inputs,
+                videos=None,
+                padding=True,
+                truncation=True,
+                return_tensors="pt",
+            )
+            ret = dict(
+                input_ids=ret["input_ids"][0],
+                # labels=ret["labels"][0],
+                attention_mask=ret["attention_mask"][0],
+                image_grid_thw=ret["image_grid_thw"][0],
+                pixel_values=ret["pixel_values"],  # kept whole, unlike the [0]-indexed fields above
+            )
+        self.cached_data_dict[i] = ret
+        return ret
+
+
+def set_signature_columns_if_needed(model):
+    # Inspect model forward signature to keep only the arguments it accepts.
+    model_to_inspect = model
+    signature = inspect.signature(model_to_inspect.forward)
+    signature_columns = list(signature.parameters.keys())
+    # Labels may be named label or label_ids, the default data collator handles that.
+    signature_columns += list(set(["label", "label_ids", 'labels']))
+    return signature_columns
+    
+def get_collator_with_removed_columns(model, data_collator: Callable, description: Optional[str] = None
+    ) -> Callable:
+        """Wrap the data collator in a callable removing unused columns."""
+        signature_columns = set_signature_columns_if_needed(model)
+
+        remove_columns_collator = RemoveColumnsCollator(
+            data_collator=data_collator,
+            signature_columns=signature_columns,
+            description=description,
+            model_name=model.__class__.__name__,
+        )
+        return remove_columns_collator
+
+
+def get_train_dataloader(train_dataset, model, data_collator=default_data_collator,
+                         train_batch_size=1, num_workers=0) -> DataLoader:
+    """
+    Returns the training [`~torch.utils.data.DataLoader`].
+
+    Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
+    training if necessary) otherwise.
+
+    Subclass and override this method if you want to inject some custom behavior.
+    """
+    if train_dataset is None:
+        raise ValueError("Trainer: training requires a train_dataset.")
+    
+    if data_collator != default_data_collator:
+        data_collator = get_collator_with_removed_columns(model, data_collator, description="training")
+
+    dataloader_params = {
+        "batch_size": train_batch_size,
+        "collate_fn": data_collator,
+        "num_workers": num_workers,
+    }
+
+    return DataLoader(train_dataset, **dataloader_params)
+
+if __name__ == '__main__':
+
+    parser.add_argument(
+        "--model_name", default="Qwen/Qwen-VL"
+    )
+
+    parser.add_argument("--quantize", action="store_true")
+    
+    parser.add_argument("--accuracy", action="store_true")
+    
+    parser.add_argument("--bits", default=4, type=int,
+                        help="number of  bits")
+
+    parser.add_argument("--group_size", default=128, type=int,
+                        help="group size")
+
+    parser.add_argument("--train_bs", default=1, type=int,
+                        help="train batch size")
+
+    parser.add_argument("--eval_bs", default=4, type=int,
+                        help="eval batch size")
+
+    parser.add_argument("--device", default="auto", type=str,
+                        help="The device to be used for tuning. The default is set to auto/None,"
+                             "allowing for automatic detection. Currently, device settings support CPU, GPU, and HPU.")
+
+    parser.add_argument("--sym", action='store_true',
+                        help=" sym quantization")
+
+    parser.add_argument("--iters", default=200, type=int,
+                        help=" iters")
+
+    parser.add_argument("--lr", default=None, type=float,
+                        help="learning rate, if None, it will be set to 1.0/iters automatically")
+
+    parser.add_argument("--minmax_lr", default=None, type=float,
+                        help="minmax learning rate, if None,it will beset to be the same with lr")
+
+    parser.add_argument("--seed", default=42, type=int,
+                        help="seed")
+
+    parser.add_argument("--adam", action='store_true',
+                        help="adam")
+
+    parser.add_argument("--seqlen", default=512, type=int,
+                        help="sequence length")
+
+    parser.add_argument("--gradient_accumulate_steps", default=8, type=int, help="gradient accumulate steps")
+
+    parser.add_argument("--nblocks", default=1, type=int, help="num of blocks to tune together")
+
+    parser.add_argument("--nsamples", default=512, type=int,
+                        help="number of samples")
+
+    parser.add_argument("--export_format", default='auto_round:gptq', type=str,
+                        help="targeted inference acceleration platform,The options are 'fake', 'cpu', 'gpu', 'xpu' and 'auto_round'."
+                             "default to 'fake', indicating that it only performs fake quantization and won't be exported to any device.")
+
+    parser.add_argument("--scale_dtype", default='fp16',
+                        help="which scale data type to use for quantization, 'fp16', 'fp32' or 'bf16'.")
+
+    parser.add_argument("--output_dir", default="./tmp_autoround", type=str,
+                        help="Where to store the final model.")
+
+    parser.add_argument("--disable_eval", action='store_true',
+                        help="Whether to do lmeval evaluation.")
+
+    parser.add_argument("--disable_amp", action='store_true',
+                        help="disable amp")
+
+    parser.add_argument("--disable_minmax_tuning", action='store_true',
+                        help="whether disable  enable weight minmax tuning")
+
+    parser.add_argument("--disable_trust_remote_code", action='store_true',
+                        help="Whether to disable trust_remote_code")
+
+    parser.add_argument("--disable_quanted_input", action='store_true',
+                        help="whether to disuse the output of quantized block to tune the next block")
+
+    parser.add_argument("--quant_lm_head", action='store_true',
+                        help="quant_lm_head")
+
+    parser.add_argument("--model_dtype", default=None, type=str,
+                        help="force to convert the dtype, some backends supports fp16 dtype better")
+    
+    parser.add_argument("--model_max_length", default=2048, type=int,
+                        help="")
+    
+    parser.add_argument("--act_bits", default=32, type=int,
+                    help="activation bits")
+    
+    parser.add_argument("--quant_vision", action='store_true',
+                        help="To determine whether the quantization should handle vision component.")
+    
+    # ========== Calibration Datasets ============= 
+    parser.add_argument("--image_folder", default="coco", type=str,
+                        help="The dataset for quantization training. It can be a custom one.")
+    
+    parser.add_argument("--question_file", default=None, type=str,
+                            help="The dataset for quantization training. It can be a custom one.")
+    
+    # ================= Evaluation Related =====================
+    # parser.add_argument("--eval-path", type=str, default=None)
+    
+    parser.add_argument("--eval_dataset", type=str, default="textvqa_val,scienceqa_test_img")
+
+    args = parser.parse_args()
+
+    set_seed(args.seed)
+    
+
+    if args.quantize:
+        if args.act_bits <= 8:
+            print(
+                "Warning, activation quantization is an experiment feature")
+        
+        if args.act_bits <= 8 and args.export_format != "fake":
+            assert False, "only support fake mode for activation quantization currently"
+            
+        if "marlin" in args.export_format and args.sym == False:
+            assert False, "marlin backend only supports sym quantization, please set --sym"
+            
+        model_name = args.model_name
+        if model_name[-1] == "/":
+            model_name = model_name[:-1]
+        print(model_name, flush=True)
+
+        device_str = detect_device(args.device)
+        torch_device = torch.device(device_str)
+        
+        model_name = args.model_name
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code,
+                                                padding_side="right", use_fast=False)
+        seqlen = args.seqlen
+        if hasattr(tokenizer, "model_max_length"):
+            if tokenizer.model_max_length < seqlen:
+                print(f"change sequence length to {tokenizer.model_max_length} due to the limitation of model_max_length",
+                    flush=True)
+                seqlen = min(seqlen, tokenizer.model_max_length)
+                args.seqlen = seqlen
+                
+        torch_dtype = "auto"
+        if "hpu" in device_str:
+            torch_dtype = torch.bfloat16 ## TODO test on hpu
+        if args.model_dtype != None:
+            if args.model_dtype == "float16" or args.model_dtype == "fp16":
+                torch_dtype = torch.float16
+            if args.model_dtype == "bfloat16" or args.model_dtype == "bf16":
+                torch_dtype = torch.bfloat16
+                
+        dtype_str = convert_dtype_torch2str(torch_dtype)
+        questions = json.load(open(args.question_file, "r"))
+        config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
+        model_type = config.model_type
+        if 'qwen2' not in model_type: # for Qwen-VL/Qwen-VL-Chat
+            tokenizer.pad_token_id = tokenizer.eod_id
+            config.use_cache = False
+            if dtype_str == "bf16":
+                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code, bf16=True).eval()
+            elif dtype_str == "fp16":
+                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code, fp16=True).eval()
+            else:
+                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code).eval()
+            # raw_data = DataFormating(questions, args.image_folder)
+            default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
+        else: # for Qwen2-VL-instruct
+            transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
+            if transformers_version[0] == 4 and transformers_version[1] < 45:
+                error_message = "Please upgrade transformers to version >= 4.45 or the newest source code to support lm-head quantization."
+                raise EnvironmentError(error_message)
+            from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+            from qwen_vl_utils import process_vision_info, fetch_image
+            model = Qwen2VLForConditionalGeneration.from_pretrained(args.model_name, torch_dtype=torch_dtype)
+            processor = AutoProcessor.from_pretrained(args.model_name)
+            tokenizer.processor = processor
+            default_collator = default_data_collator
+            
+        raw_data = DataFormating(questions, args.image_folder, model_type=model_type)
+        dataset = LazySupervisedDataset(raw_data, tokenizer,
+                                        max_len=min(args.seqlen, tokenizer.model_max_length), image_folder=args.image_folder)
+        dataloader = get_train_dataloader(dataset, model, data_collator=default_collator, train_batch_size=args.train_bs)
+        
+        
+        model = model.eval()
+        seqlen = args.seqlen
+                    
+        lm_head_layer_name = "lm_head"
+        # for n, _ in model.named_modules():
+        #     lm_head_layer_name = n
+        if args.quant_lm_head:
+            from transformers import AutoConfig
+            config = AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
+            if config.tie_word_embeddings and hasattr(model, "_tied_weights_keys"):
+                tied_keys = model._tied_weights_keys
+                for item in tied_keys:
+                    if lm_head_layer_name in item:  ##TODO extend to encoder-decoder layer, seq classification model
+                        args.quant_lm_head = False
+                        print(
+                            f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been "
+                            f"supported currently")
+                        break
+            
+        quant_block_list = get_multimodal_block_names(model, args.quant_vision)
+        
+        # dataset=dataloader, layer_config=, amp, 
+        quant_config = AutoRoundConfig(bits=args.bits, use_sym=args.sym, batch_size=args.train_bs, group_size=args.group_size,
+                        seqlen=seqlen, nblocks=args.nblocks, iters=args.iters, lr=args.lr,
+                        minmax_lr=args.minmax_lr, enable_quanted_input=not args.disable_quanted_input,
+                        nsamples=args.nsamples, seed=args.seed, gradient_accumulate_steps=args.gradient_accumulate_steps,
+                        scale_dtype=args.scale_dtype, enable_minmax_tuning=not args.disable_minmax_tuning, act_bits=args.act_bits,
+                        quant_block_list=quant_block_list, export_format=args.export_format)
+        
+        all_block_list = get_multimodal_block_names(model, quant_vision=True)
+        all_block_set = set(tuple(block) for block in all_block_list)
+        quant_block_set = set(tuple(block) for block in quant_block_list)
+        set_to_full_prec = list(all_block_set - quant_block_set)
+        set_to_full_prec = get_layer_names_in_block(model, quant_block_list=set_to_full_prec)
+        for name in set_to_full_prec:
+            quant_config.set_local(name, AutoRoundConfig(dtype="fp32"))
+        
+        for n, m in model.named_modules():
+            if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
+                if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
+                    quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
+                    print(
+                        f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
+        # skip special layers
+        quant_config.set_local("transformer.visual.attn_pool.*_proj", AutoRoundConfig(dtype="fp32"))
+
+        if not args.quant_lm_head:
+            quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
+            transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
+            if transformers_version[0] == 4 and transformers_version[1] < 38:
+                error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
+                raise EnvironmentError(error_message)
+            
+        run_args = (dataloader, seqlen, args.nsamples)
+        user_model = prepare(model=model, quant_config=quant_config)
+        run_fn_for_vlm_autoround(user_model, *run_args)
+        user_model = convert(user_model)
+
+        # user_model.save(args.output_dir, format="huggingface")
+        from neural_compressor.torch.utils import (LoadFormat,)
+        user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE)
+        if tokenizer is not None:
+            tokenizer.save_pretrained(args.output_dir)
+
+    if args.accuracy:  ## TODO
+        # Load the checkpoint at args.model_name and run each comma-separated --eval_dataset entry.
+        model_name = args.model_name
+        device_str = detect_device(args.device)
+        torch_device = torch.device(device_str)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code,
+                                                  padding_side="right", use_fast=False)
+        model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code)
+        model = model.to(torch_device)
+        datasets=args.eval_dataset.split(',')
+        for dataset in datasets:
+            if 'vqa' in dataset:  # e.g. the default entry "textvqa_val"
+                from mm_evaluation.evaluate_vqa import textVQA_evaluation
+                evaluator = textVQA_evaluation(  # NOTE(review): the result is not used afterwards
+                    model,
+                    dataset_name=dataset,
+                    tokenizer=tokenizer,
+                    batch_size=args.eval_bs,
+                    device=str(torch_device)
+                )
+            elif 'scienceqa' in dataset:  # e.g. the default entry "scienceqa_test_img"
+                from mm_evaluation.evaluate_multiple_choice import scienceQA_evaluation
+                evaluator = scienceQA_evaluation(  # NOTE(review): the result is not used afterwards
+                    model,
+                    dataset_name=dataset,
+                    tokenizer=tokenizer,
+                    batch_size=args.eval_bs,
+                    device=str(torch_device)
+                )
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/__init__.py
new file mode 100644
index 00000000000..01913bdfeb9
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/__init__.py
@@ -0,0 +1,4 @@
+# if __name__ == "__main__":
+#     import sys
+#     sys.path.insert(0, './')
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py
new file mode 100644
index 00000000000..11c8944072e
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py
@@ -0,0 +1,216 @@
+import argparse
+import itertools
+import json
+import os
+from functools import partial
+
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+multiple_choices = ['A', 'B', 'C', 'D', 'E']  # option letters, in choice order
+# Maps a dataset name to the JSONL file of each of its splits.
+ds_collections = {
+    'scienceqa_test_img': {
+        'test': 'data/scienceqa/scienceqa_test_img.jsonl',
+    }
+}
+
+
+def collate_fn(batches, pad_token_id):
+    """Flatten per-sample candidate sequences into one left-padded batch.
+
+    Each sample contributes one token sequence per answer option; all of them
+    are concatenated, left-padded with ``pad_token_id`` to the longest length
+    and returned with a mask that is 0.0 wherever a token equals the pad id.
+    """
+    per_sample = [item['input_tokens'] for item in batches]
+    target_lengths = [item['target_lengths'] for item in batches]
+    answers = [item['answer'] for item in batches]
+    # Number of candidates per sample, used later to split the flat batch.
+    chunk_sizes = [len(seqs) for seqs in per_sample]
+    flat = list(itertools.chain.from_iterable(per_sample))
+    width = max(len(seq) for seq in flat)
+    padded = [[pad_token_id] * (width - len(seq)) + seq for seq in flat]
+    input_tokens = torch.LongTensor(padded)
+    attention_mask = 1 - input_tokens.eq(pad_token_id).float()
+    return input_tokens, attention_mask, target_lengths, answers, chunk_sizes
+
+
+class MultipleChoiceDataste(torch.utils.data.Dataset):
+    """JSONL-backed multiple-choice dataset yielding tokenized candidates.
+
+    Every item holds one ``prompt + ' <letter>'`` token list per answer option
+    so that the caller can score each option by its loss.
+    """
+
+    def __init__(self, test, prompt, tokenizer):
+        # Read eagerly so the file handle is closed right away.
+        with open(test) as f:
+            self.data = f.readlines()
+        self.prompt = prompt
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        data = json.loads(self.data[idx].strip())
+        hint = data['hint'] if data['hint'] else 'N/A'
+        choices = data['choices']
+        # Render options as "A. ...", "B. ..."; letters follow list order.
+        choice_txt = '\n'.join(
+            '{}. {}'.format(multiple_choices[i], c)
+            for i, c in enumerate(choices))
+        prompt = self.prompt.format(
+            data['image'], hint, data['question'], choice_txt)
+        prompt_tokens = self.tokenizer(prompt).input_ids
+        # One continuation per option: a space followed by its letter.
+        target_tokens = [self.tokenizer(' ' + letter).input_ids
+                         for letter in multiple_choices[:len(choices)]]
+        return {
+            'input_tokens': [prompt_tokens + t for t in target_tokens],
+            'target_lengths': [len(t) for t in target_tokens],
+            'answer': data['answer'],
+        }
+
+
+class InferenceSampler(torch.utils.data.sampler.Sampler):
+    """Sampler yielding the contiguous index shard owned by this rank."""
+
+    def __init__(self, size):
+        self._size = int(size)
+        assert size > 0
+        self._rank = torch.distributed.get_rank()
+        self._world_size = torch.distributed.get_world_size()
+        self._local_indices = self._get_local_indices(
+            size, self._world_size, self._rank)
+
+    @staticmethod
+    def _get_local_indices(total_size, world_size, rank):
+        # The first ``extra`` ranks each take one more index than the rest.
+        base, extra = divmod(total_size, world_size)
+        sizes = [base + (1 if r < extra else 0) for r in range(world_size)]
+        start = sum(sizes[:rank])
+        stop = min(sum(sizes[:rank + 1]), total_size)
+        return range(start, stop)
+
+    def __iter__(self):
+        yield from self._local_indices
+
+    def __len__(self):
+        return len(self._local_indices)
+
+
+def scienceQA_evaluation(model_name, dataset_name, dataset_path=None, tokenizer=None,
+                       batch_size=1, few_shot=0, seed=0, trust_remote_code=True, device="cuda:0"):
+    """Score a model on a multiple-choice dataset and print its Acc@1.
+
+    Every answer option is appended to the prompt, and the option with the
+    lowest mean loss over its own tokens is taken as the prediction.
+
+    Args:
+        model_name: Checkpoint path/name to load, or an already loaded model.
+        dataset_name: Key into ``ds_collections``.
+        dataset_path: Optional directory prepended to the dataset file path.
+        tokenizer: Required when ``model_name`` is a model object; it must
+            expose ``eod_id``, which is used as the padding id.
+        batch_size: Number of questions per batch.
+        few_shot: Not used by this function.
+        seed: Not used by this function.
+        trust_remote_code: Forwarded to ``from_pretrained`` when loading.
+        device: Device a freshly loaded model is moved to. Input tensors
+            are still placed with ``.cuda()`` on the current CUDA device.
+    """
+    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
+    if isinstance(model_name, str):
+        # AutoConfig is not among this module's top-level imports.
+        from transformers import AutoConfig
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=trust_remote_code).eval()
+        model = model.to(torch.device(device))
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code, use_fast=False)
+    else:
+        assert tokenizer is not None, "Two types of parameter passing are supported:model_path or model with tokenizer."
+        model = model_name
+
+    prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:'
+
+    test_file = ds_collections[dataset_name]['test']
+    if dataset_path is not None:
+        test_file = os.path.join(dataset_path, test_file)
+    dataset = MultipleChoiceDataste(test=test_file,
+                                    prompt=prompt,
+                                    tokenizer=tokenizer)
+    dataloader = torch.utils.data.DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        pin_memory=True,
+        drop_last=False,
+        collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
+    )
+
+    # One entry per question: 1 for a correct prediction, 0 otherwise.
+    results = []
+    with torch.no_grad():
+        for input_tokens, attention_mask, target_lengths, answers, chunk_sizes in tqdm(dataloader):
+            outputs = model(
+                input_ids=input_tokens[:, :-1].cuda(),
+                attention_mask=attention_mask[:, :-1].cuda(),
+                return_dict=True,
+            )
+            # Per-token next-token loss; cross_entropy wants (N, vocab, seq).
+            losses = torch.nn.functional.cross_entropy(
+                outputs.logits.permute(0, 2, 1),
+                input_tokens[:, 1:].cuda(),
+                reduction='none')
+            # Regroup the flat candidate rows by the question they belong to.
+            losses = losses.split(chunk_sizes, dim=0)
+
+            for loss, target_length, answer in zip(losses, target_lengths, answers):
+                # Average only over each candidate's trailing answer tokens.
+                target_loss = torch.stack([
+                    loss[i, -length:].mean()
+                    for i, length in enumerate(target_length)])
+                pred = target_loss.argmin().item()
+                results.append(1 if pred == answer else 0)
+
+    print(f"Evaluating {dataset_name} ...")
+    # An empty dataset yields NaN instead of a ZeroDivisionError.
+    accuracy = sum(results) / len(results) if results else float('nan')
+    print(f'Acc@1: {accuracy}')
+
+
+
+
+if __name__ == "__main__":
+    # Command-line entry point: evaluate one checkpoint on one dataset.
+    import time
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_name", default="Qwen/Qwen-VL"
+    )
+    parser.add_argument(
+        "--dataset_name", default="scienceqa_test_img"
+    )
+    parser.add_argument(
+        # Parse as int: argparse would otherwise hand DataLoader a string.
+        "--eval_bs", default=4, type=int,
+    )
+    parser.add_argument(
+        "--trust_remote_code", action='store_true',
+        help="Whether to enable trust_remote_code"
+    )
+    args = parser.parse_args()
+    s = time.time()
+    evaluator = scienceQA_evaluation(
+        args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.eval_bs,
+        trust_remote_code=args.trust_remote_code
+    )
+    # Wall-clock time of model loading plus evaluation.
+    print("cost time: ", time.time() - s)
+
+    
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_vqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_vqa.py
new file mode 100644
index 00000000000..a6192af4d5a
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_vqa.py
@@ -0,0 +1,464 @@
+import argparse
+import itertools
+import json
+import os
+import random
+import time
+from functools import partial
+from typing import Optional
+
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from .vqa import VQA
+from .vqa_eval import VQAEval
+
+# This code is largely adapted from https://github.com/cognitedata/Qwen-VL-finetune/blob/master/eval_mm/evaluate_vqa.py
+# Per-dataset settings: data file paths, scoring metric name and max_new_tokens for generation.
+ds_collections = {
+    'vqav2_val': {
+        'train': 'data/vqav2/vqav2_train.jsonl',
+        'test': 'data/vqav2/vqav2_val.jsonl',
+        'question': 'data/vqav2/v2_OpenEnded_mscoco_val2014_questions.json',
+        'annotation': 'data/vqav2/v2_mscoco_val2014_annotations.json',
+        'metric': 'vqa_score',
+        'max_new_tokens': 10,
+    },
+    'vqav2_testdev': {
+        'train': 'data/vqav2/vqav2_train.jsonl',
+        'test': 'data/vqav2/vqav2_testdev.jsonl',
+        'metric': None,
+        'max_new_tokens': 10,
+    },
+    'okvqa_val': {
+        'train': 'data/okvqa/okvqa_train.jsonl',
+        'test': 'data/okvqa/okvqa_val.jsonl',
+        'question': 'data/okvqa/OpenEnded_mscoco_val2014_questions.json',
+        'annotation': 'data/okvqa/mscoco_val2014_annotations.json',
+        'metric': 'vqa_score',
+        'max_new_tokens': 10,
+    },
+    'textvqa_val': {
+        'train': 'data/textvqa/textvqa_train.jsonl',
+        'test': 'data/textvqa/textvqa_val.jsonl',
+        'question': 'data/textvqa/textvqa_val_questions.json',
+        'annotation': 'data/textvqa/textvqa_val_annotations.json',
+        'metric': 'vqa_score',
+        'max_new_tokens': 10,
+    },
+    'vizwiz_val': {
+        'train': 'data/vizwiz/vizwiz_train.jsonl',
+        'test': 'data/vizwiz/vizwiz_val.jsonl',
+        'question': 'data/vizwiz/vizwiz_val_questions.json',
+        'annotation': 'data/vizwiz/vizwiz_val_annotations.json',
+        'metric': 'vqa_score',
+        'max_new_tokens': 10,
+    },
+    'vizwiz_test': {
+        'train': 'data/vizwiz/vizwiz_train.jsonl',
+        'test': 'data/vizwiz/vizwiz_test.jsonl',
+        'metric': None,
+        'max_new_tokens': 10,
+    },
+    'docvqa_val': {
+        'train': 'data/docvqa/train.jsonl',
+        'test': 'data/docvqa/val.jsonl',
+        'annotation': 'data/docvqa/val/val_v1.0.json',
+        'metric': 'anls',
+        'max_new_tokens': 100,
+    },
+    'docvqa_test': {
+        'train': 'data/docvqa/train.jsonl',
+        'test': 'data/docvqa/test.jsonl',
+        'metric': None,
+        'max_new_tokens': 100,
+    },
+    'chartqa_test_human': {
+        'train': 'data/chartqa/train_human.jsonl',
+        'test': 'data/chartqa/test_human.jsonl',
+        'metric': 'relaxed_accuracy',
+        'max_new_tokens': 100,
+    },
+    'chartqa_test_augmented': {
+        'train': 'data/chartqa/train_augmented.jsonl',
+        'test': 'data/chartqa/test_augmented.jsonl',
+        'metric': 'relaxed_accuracy',
+        'max_new_tokens': 100,
+    },
+    'gqa_testdev': {
+        'train': 'data/gqa/train.jsonl',
+        'test': 'data/gqa/testdev_balanced.jsonl',
+        'metric': 'accuracy',
+        'max_new_tokens': 10,
+    },
+    'ocrvqa_val': {
+        'train': 'data/ocrvqa/ocrvqa_train.jsonl',
+        'test': 'data/ocrvqa/ocrvqa_val.jsonl',
+        'metric': 'accuracy',
+        'max_new_tokens': 100,
+    },
+    'ocrvqa_test': {
+        'train': 'data/ocrvqa/ocrvqa_train.jsonl',
+        'test': 'data/ocrvqa/ocrvqa_test.jsonl',
+        'metric': 'accuracy',
+        'max_new_tokens': 100,
+    },
+    'ai2diagram_test': {
+        'train': 'data/ai2diagram/train.jsonl',
+        'test': 'data/ai2diagram/test.jsonl',
+        'metric': 'accuracy',
+        'max_new_tokens': 10,
+    }
+}
+
+# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
+def relaxed_correctness(target: str,
+                        prediction: str,
+                        max_relative_change: float = 0.05) -> bool:
+    """Return whether ``prediction`` matches ``target`` under relaxed accuracy.
+
+    Numeric answers, optionally written as percentages, count as correct
+    when their relative deviation from the target does not exceed
+    ``max_relative_change``. This follows the relaxed accuracy measure
+    described at the end of section 5.1 of
+    https://arxiv.org/pdf/2203.10244.pdf, which tolerates small errors in
+    numeric answers. Everything else, including a target that parses to
+    zero, must match exactly up to letter case.
+
+    Args:
+      target: Target string.
+      prediction: Predicted string.
+      max_relative_change: Maximum relative change.
+
+    Returns:
+      Whether the prediction was correct given the specified tolerance.
+    """
+
+    def _parse_number(text: str) -> Optional[float]:
+        # A trailing '%' marks a percentage, which is scaled to a fraction.
+        is_percent = text.endswith('%')
+        try:
+            value = float(text.rstrip('%')) if is_percent else float(text)
+        except ValueError:
+            return None
+        return value / 100.0 if is_percent else value
+
+    predicted = _parse_number(prediction)
+    expected = _parse_number(target)
+
+    # A zero or unparsable target cannot serve as the reference value, and
+    # an unparsable prediction cannot be compared numerically.
+    if predicted is None or not expected:
+        return prediction.lower() == target.lower()
+    deviation = abs(predicted - expected) / abs(expected)
+    return deviation <= max_relative_change
+
+
+def evaluate_relaxed_accuracy(entries):
+    """Mean relaxed-correctness score; a str annotation is wrapped in place."""
+    # NOTE(review): the model answer is passed as `target` and the annotation
+    # as `prediction`, so the tolerance is relative to the answer - confirm.
+    scores = []
+    for elem in entries:
+        if isinstance(elem['annotation'], str):
+            elem['annotation'] = [elem['annotation']]
+        answer = elem['answer'].strip()
+        scores.append(max(relaxed_correctness(answer, a) for a in elem['annotation']))
+    return sum(scores) / len(scores)
+
+
+def evaluate_exact_match_accuracy(entries):
+    """Fraction of entries whose answer equals any annotation.
+
+    Case and outer whitespace are ignored; str annotations become lists.
+    """
+    scores = []
+    for elem in entries:
+        if isinstance(elem['annotation'], str):
+            elem['annotation'] = [elem['annotation']]
+        guess = elem['answer'].strip().lower()
+        scores.append(max(float(guess == a.strip().lower()) for a in elem['annotation']))
+    return sum(scores) / len(scores)
+
+
+def collate_fn(batches, tokenizer):
+    """Tokenize a batch of prompts, padded to the longest one in the batch."""
+    questions, question_ids, annotations = [], [], []
+    for item in batches:
+        questions.append(item['question'])
+        question_ids.append(item['question_id'])
+        annotations.append(item['annotation'])
+    encoded = tokenizer(questions, return_tensors='pt', padding='longest')
+    return question_ids, encoded.input_ids, encoded.attention_mask, annotations
+
+
+class VQADataset(torch.utils.data.Dataset):
+    """JSONL-backed VQA prompts with optional random few-shot examples."""
+
+    def __init__(self, train, test, prompt, few_shot):
+        # Read eagerly so the file handle is closed right away.
+        with open(test) as f:
+            self.test = f.readlines()
+        self.prompt = prompt
+        self.few_shot = few_shot
+        # The training split is only read when demonstrations are requested.
+        if few_shot > 0:
+            with open(train) as f:
+                self.train = f.readlines()
+
+    def __len__(self):
+        return len(self.test)
+
+    def __getitem__(self, idx):
+        data = json.loads(self.test[idx].strip())
+        few_shot_prompt = ''
+        if self.few_shot > 0:
+            # Demonstrations are drawn from the global `random` state.
+            for line in random.sample(self.train, self.few_shot):
+                sample = json.loads(line.strip())
+                few_shot_prompt += self.prompt.format(
+                    sample['image'], sample['question']) + f" {sample['answer']}"
+        return {
+            'question': few_shot_prompt + self.prompt.format(
+                data['image'], data['question']),
+            'question_id': data['question_id'],
+            'annotation': data.get('answer', None),
+        }
+
+
+class InferenceSampler(torch.utils.data.sampler.Sampler):
+    """Yields the contiguous block of dataset indices assigned to this rank."""
+
+    def __init__(self, size):
+        self._size = int(size)
+        assert size > 0
+        self._rank = torch.distributed.get_rank()
+        self._world_size = torch.distributed.get_world_size()
+        self._local_indices = self._get_local_indices(
+            size, self._world_size, self._rank)
+
+    @staticmethod
+    def _get_local_indices(total_size, world_size, rank):
+        # Shards differ by at most one: the first ``extra`` ranks get +1.
+        base, extra = divmod(total_size, world_size)
+        sizes = [base + int(r < extra) for r in range(world_size)]
+        start = sum(sizes[:rank])
+        stop = min(sum(sizes[:rank + 1]), total_size)
+        return range(start, stop)
+
+    def __iter__(self):
+        return iter(self._local_indices)
+
+    def __len__(self):
+        return len(self._local_indices)
+
+
+def textVQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", dataset_path=None, tokenizer=None,
+                       batch_size=1, few_shot=0, seed=0, trust_remote_code=True, device="cuda:0"):
+    """Generate answers for a VQA dataset, save them as JSON and print the score.
+
+    Args:
+        model_name: Checkpoint path/name to load, or an already loaded model.
+        dataset_name: Key into ``ds_collections``.
+        base_model: Checkpoint the tokenizer is loaded from when
+            ``model_name`` is a string.
+        dataset_path: Optional directory prepended to every JSON/JSONL path
+            of the selected dataset.
+        tokenizer: Required when ``model_name`` is a model object; it must
+            expose ``eod_id``, which is used as pad and end-of-sequence id.
+        batch_size: Number of questions per generation batch.
+        few_shot: Number of random training examples prepended to a prompt.
+        seed: Seed for the ``random`` module, which draws those examples.
+        trust_remote_code: Forwarded to ``from_pretrained`` when loading.
+        device: Device a freshly loaded model is moved to. Input tensors
+            are still placed with ``.cuda()`` on the current CUDA device.
+
+    Raises:
+        NotImplementedError: If no output record layout is defined for
+            ``dataset_name``.
+    """
+    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
+    if isinstance(model_name, str):
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=trust_remote_code).eval()
+        model = model.to(torch.device(device))
+        tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=trust_remote_code,
+                                              padding_side="right", use_fast=False)
+    else:
+        assert tokenizer is not None, "Two types of parameter passing are supported:model_path or model with tokenizer."
+        model = model_name
+
+    # Pad on the left so generation continues directly after each prompt.
+    tokenizer.padding_side = 'left'
+    tokenizer.pad_token_id = tokenizer.eod_id
+
+    prompt = '<img>{}</img>{} Answer:'
+    # Resolve paths on a copy: mutating the module-level table would prefix
+    # ``dataset_path`` once more on every further call.
+    ds_config = dict(ds_collections[dataset_name])
+    if dataset_path is not None:
+        for key, value in ds_config.items():
+            if isinstance(value, str) and "json" in value:
+                ds_config[key] = os.path.join(dataset_path, value)
+
+    random.seed(seed)
+    dataset = VQADataset(
+        train=ds_config['train'],
+        test=ds_config['test'],
+        prompt=prompt,
+        few_shot=few_shot,
+    )
+
+    dataloader = torch.utils.data.DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        pin_memory=True,
+        drop_last=False,
+        collate_fn=partial(collate_fn, tokenizer=tokenizer),
+    )
+
+    outputs = []
+    for question_ids, input_ids, attention_mask, annotations in tqdm(dataloader):
+        pred = model.generate(
+            input_ids=input_ids.cuda(),
+            attention_mask=attention_mask.cuda(),
+            do_sample=False,
+            num_beams=1,
+            max_new_tokens=ds_config['max_new_tokens'],
+            min_new_tokens=1,
+            length_penalty=1,
+            num_return_sequences=1,
+            output_hidden_states=True,
+            use_cache=True,
+            pad_token_id=tokenizer.eod_id,
+            eos_token_id=tokenizer.eod_id,
+        )
+        # Drop the prompt tokens so that only the continuation is decoded.
+        answers = [
+            tokenizer.decode(_[input_ids.size(1):].cpu(),
+                             skip_special_tokens=True).strip() for _ in pred
+        ]
+
+        for question_id, answer, annotation in zip(question_ids, answers,
+                                                   annotations):
+            if dataset_name in ['vqav2_val', 'vqav2_testdev', 'okvqa_val', 'textvqa_val', 'vizwiz_val']:
+                outputs.append({
+                    'question_id': question_id,
+                    'answer': answer,
+                })
+            elif dataset_name in ['docvqa_val', 'infographicsvqa', 'gqa_testdev', 'ocrvqa_val', 'ocrvqa_test']:
+                outputs.append({
+                    'questionId': question_id,
+                    'answer': answer,
+                    'annotation': annotation,
+                })
+            elif dataset_name in ['ai2diagram_test']:
+                outputs.append({
+                    'image': question_id,
+                    'answer': answer,
+                    'annotation': annotation,
+                })
+            # Test the dataset *name*: ``dataset`` is the VQADataset object.
+            elif dataset_name in ['chartqa_test_human', 'chartqa_test_augmented']:
+                outputs.append({
+                    'answer': answer,
+                    'annotation': annotation,
+                })
+            elif dataset_name in ['docvqa_test']:
+                outputs.append({
+                    'questionId': question_id,
+                    'answer': answer,
+                })
+            elif dataset_name in ['vizwiz_test']:
+                outputs.append({
+                    'image': question_id,
+                    'answer': answer,
+                })
+            else:
+                raise NotImplementedError
+
+    # Single-process run: this process's outputs are the full result set.
+    merged_outputs = outputs
+
+    print(f"Evaluating {dataset_name} ...")
+    time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
+    # Name the file after the dataset key rather than the dataset object.
+    results_file = f'{dataset_name}_{time_prefix}_fs{few_shot}_s{seed}.json'
+    with open(results_file, 'w') as f:
+        json.dump(merged_outputs, f, ensure_ascii=False)
+
+    metric = ds_config['metric']
+    if metric == 'vqa_score':
+        vqa = VQA(ds_config['annotation'], ds_config['question'])
+        results = vqa.loadRes(
+            resFile=results_file,
+            quesFile=ds_config['question'])
+        vqa_scorer = VQAEval(vqa, results, n=2)
+        vqa_scorer.evaluate()
+
+        print(vqa_scorer.accuracy)
+
+    elif metric == 'anls':
+        # ANLS is delegated to an external script reading the results file.
+        command = ('python infographicsvqa_eval.py -g ' +
+                   ds_config['annotation'] + ' -s ' + results_file)
+        print(command)
+        os.system(command)
+    elif metric == 'relaxed_accuracy':
+        print({
+            'relaxed_accuracy': evaluate_relaxed_accuracy(merged_outputs)
+        })
+    elif metric == 'accuracy':
+        if 'gqa' in dataset_name:
+            # Keep the first clause, then cut leading filler and any " of" tail.
+            for entry in merged_outputs:
+                response = entry['answer']
+                response = response.strip().split('.')[0].split(
+                    ',')[0].split('!')[0].lower()
+                for marker in ('is ', 'are ', 'a ', 'an ', 'the '):
+                    if marker in response:
+                        response = response.split(marker)[1]
+                if ' of' in response:
+                    response = response.split(' of')[0]
+                entry['answer'] = response.strip()
+        print({'accuracy': evaluate_exact_match_accuracy(merged_outputs)})
+    
+    
+    
+
+if __name__ == "__main__":
+    # Command-line entry point: evaluate one checkpoint on one VQA dataset.
+    # NOTE(review): the relative imports at the top of this module presumably
+    # require launching it as a package module (``python -m``) - confirm.
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_name", default="Qwen/Qwen-VL"
+    )
+    parser.add_argument(
+        "--base_model", default="Qwen/Qwen-VL"
+    )
+    parser.add_argument(
+        "--dataset_name", default="textvqa_val"
+    )
+    parser.add_argument(
+        # Parse as int: argparse would otherwise pass the CLI value as str.
+        "--eval_bs", default=4, type=int,
+    )
+    parser.add_argument(
+        "--trust_remote_code", action='store_true',
+        help="Whether to enable trust_remote_code"
+    )
+    args = parser.parse_args()
+    s = time.time()
+    evaluator = textVQA_evaluation(
+        args.model_name,
+        base_model=args.base_model,
+        dataset_name=args.dataset_name,
+        batch_size=args.eval_bs,
+        trust_remote_code=args.trust_remote_code
+    )
+    # Wall-clock time of model loading plus evaluation.
+    print("cost time: ", time.time() - s)
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/main.py
new file mode 100644
index 00000000000..11668d5a930
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/main.py
@@ -0,0 +1,101 @@
+
+if __name__ == "__main__":
+    # Standalone driver: loads one checkpoint and evaluates it on each task.
+    import sys
+
+    sys.path.insert(0, '../../../')
+    import time
+    import torch
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_name", default="/models/opt-125m/"
+    )
+    parser.add_argument(
+        "--eval_bs", default=4, type=int,
+    )
+    parser.add_argument(
+        "--trust_remote_code", action='store_true',
+        help="Whether to enable trust_remote_code"
+    )
+    parser.add_argument(
+        "--device", default="cuda:0",
+        help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation."
+    )
+    parser.add_argument(
+        "--base_model", default="Qwen/Qwen-VL"
+    )
+    parser.add_argument(
+        "--model_dtype", default=None, type=str,
+        help="force to convert the dtype, some backends supports fp16 dtype better"
+    )
+    parser.add_argument(
+        "--tasks",
+        default="textvqa_val,scienceqa_test_img",
+        help="Comma-separated dataset names to evaluate (names containing 'vqa' or 'scienceqa')"
+    )
+
+    args = parser.parse_args()
+    s = time.time()
+    from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
+    from auto_round.utils import convert_dtype_torch2str
+
+    config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=args.trust_remote_code)
+    # AutoHfQuantizer is unused below; presumably imported for its side effects - confirm.
+    if hasattr(config, "quantization_config"):
+        quantization_config = config.quantization_config
+        if "quant_method" in quantization_config and "auto-round" in quantization_config["quant_method"]:
+            from auto_round.auto_quantizer import AutoHfQuantizer
+        elif "quant_method" in quantization_config and quantization_config["quant_method"] == "gptq":
+            if args.device == "hpu":
+                from auto_round.auto_quantizer import AutoHfQuantizer
+    model_name = args.model_name
+    torch_dtype = torch.float
+    if args.model_dtype is not None:
+        if args.model_dtype in ("float16", "fp16"):
+            torch_dtype = torch.float16
+        elif args.model_dtype in ("bfloat16", "bfp16"):
+            torch_dtype = torch.bfloat16
+    dtype_str = convert_dtype_torch2str(torch_dtype)
+    if dtype_str == "bf16":
+        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=args.trust_remote_code, device_map=args.device, bf16=True).eval()
+    elif dtype_str == "fp16":
+        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=args.trust_remote_code, device_map=args.device, fp16=True).eval()
+    else:
+        model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=args.trust_remote_code, device_map=args.device).eval()
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=args.trust_remote_code, padding_side="right", use_fast=False)
+    tokenizer.pad_token_id = tokenizer.eod_id
+    test_tasks = args.tasks
+    if isinstance(test_tasks, str):
+        test_tasks = test_tasks.split(',')
+    # Dispatch on the task name; names matching neither pattern are skipped.
+    for dataset in test_tasks:
+        if 'vqa' in dataset:
+            from evaluate_vqa import textVQA_evaluation  # NOTE(review): evaluate_vqa uses relative imports - confirm this resolves
+            with torch.cuda.amp.autocast():
+                evaluator = textVQA_evaluation(
+                    model,
+                    dataset_name=dataset,
+                    # dataset_path=args.eval_path,
+                    tokenizer=tokenizer,
+                    batch_size=args.eval_bs,
+                    trust_remote_code=args.trust_remote_code,
+                    device=str(args.device)
+                )
+        elif 'scienceqa' in dataset:
+            from evaluate_multiple_choice import scienceQA_evaluation
+            with torch.cuda.amp.autocast():
+                evaluator = scienceQA_evaluation(
+                    model,
+                    dataset_name=dataset,
+                    # dataset_path=args.eval_path,
+                    tokenizer=tokenizer,
+                    batch_size=args.eval_bs,
+                    trust_remote_code=args.trust_remote_code,
+                    device=str(args.device)
+                )
+
+    print("cost time: ", time.time() - s)
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa.py
new file mode 100644
index 00000000000..d3b17d00903
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa.py
@@ -0,0 +1,206 @@
+"""Copyright (c) 2022, salesforce.com, inc.
+
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+__author__ = 'aagrawal'
+__version__ = '0.9'
+
+# Interface for accessing the VQA dataset.
+
+# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
+# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
+
+# The following functions are defined:
+#  VQA        - VQA class that loads VQA annotation file and prepares data structures.
+#  getQuesIds - Get question ids that satisfy given filter conditions.
+#  getImgIds  - Get image ids that satisfy given filter conditions.
+#  loadQA     - Load questions and answers with the specified question ids.
+#  showQA     - Display the specified questions and answers.
+#  loadRes    - Load result file and create result object.
+
+# Help on each function can be accessed by: "help(VQA.function)"
+
+import copy
+import datetime
+import json
+
+
+class VQA:
+
+    def __init__(self, annotation_file=None, question_file=None):
+        """Constructor of VQA helper class for reading and visualizing
+        questions and answers.
+
+        :param annotation_file (str): location of VQA annotation file
+        :param question_file (str): location of VQA question file
+        """
+        # Start empty; the index is only built when both files are given.
+        self.dataset = {}
+        self.questions = {}
+        self.qa = {}
+        self.qqa = {}
+        self.imgToQA = {}
+        if annotation_file is not None and question_file is not None:
+            print('loading VQA annotations and questions into memory...')
+            # Context managers close both JSON files once they are parsed.
+            with open(annotation_file, 'r') as annotation_fp:
+                self.dataset = json.load(annotation_fp)
+            with open(question_file, 'r') as question_fp:
+                self.questions = json.load(question_fp)
+            self.createIndex()
+
+    def createIndex(self):
+        # create index
+        print('creating index...')
+        imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
+        qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
+        qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
+        for ann in self.dataset['annotations']:
+            imgToQA[ann['image_id']] += [ann]
+            qa[ann['question_id']] = ann
+        for quest in self.questions['questions']:
+            qqa[quest['question_id']] = quest
+        print('index created!')
+
+        # create class members
+        self.qa = qa
+        self.qqa = qqa
+        self.imgToQA = imgToQA
+
+    def info(self):
+        """Print information about the VQA annotation file.
+
+        :return:
+        """
+        for key, value in self.dataset['info'].items():
+            print('%s: %s' % (key, value))
+
+    def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
+        """Get question ids that satisfy given filter conditions. default skips
+        that filter.
+
+        :param  imgIds    (int array)   : get question ids for given imgs
+                        quesTypes (str array)   : get question ids for given question types
+                        ansTypes  (str array)   : get question ids for given answer types
+        :return:    ids   (int array)   : integer array of question ids
+        """
+        imgIds = imgIds if type(imgIds) == list else [imgIds]
+        quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
+        ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
+
+        if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
+            anns = self.dataset['annotations']
+        else:
+            if not len(imgIds) == 0:
+                anns = sum(
+                    [
+                        self.imgToQA[imgId]
+                        for imgId in imgIds if imgId in self.imgToQA
+                    ],
+                    [],
+                )
+            else:
+                anns = self.dataset['annotations']
+            anns = (anns if len(quesTypes) == 0 else
+                    [ann for ann in anns if ann['question_type'] in quesTypes])
+            anns = (anns if len(ansTypes) == 0 else
+                    [ann for ann in anns if ann['answer_type'] in ansTypes])
+        ids = [ann['question_id'] for ann in anns]
+        return ids
+
+    def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
+        """Get image ids that satisfy given filter conditions. default skips
+        that filter.
+
+        :param  quesIds   (int array)   : get image ids for given question ids
+                quesTypes (str array)   : get image ids for given question types
+                ansTypes  (str array)   : get image ids for given answer types
+        :return:    ids   (int array)   : integer array of image ids
+        """
+        quesIds = quesIds if isinstance(quesIds, list) else [quesIds]
+        quesTypes = quesTypes if isinstance(quesTypes, list) else [quesTypes]
+        ansTypes = ansTypes if isinstance(ansTypes, list) else [ansTypes]
+
+        if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
+            anns = self.dataset['annotations']
+        else:
+            if not len(quesIds) == 0:
+                # self.qa maps a question id to ONE annotation dict, so collect
+                # the dicts directly; sum(..., []) would raise TypeError here.
+                anns = [self.qa[quesId] for quesId in quesIds if quesId in self.qa]
+            else:
+                anns = self.dataset['annotations']
+            anns = (anns if len(quesTypes) == 0 else
+                    [ann for ann in anns if ann['question_type'] in quesTypes])
+            anns = (anns if len(ansTypes) == 0 else
+                    [ann for ann in anns if ann['answer_type'] in ansTypes])
+        ids = [ann['image_id'] for ann in anns]
+        return ids
+
+    def loadQA(self, ids=[]):
+        """Load questions and answers with the specified question ids.
+
+        :param ids (int array)       : integer ids specifying question ids
+        :return: qa (object array)   : loaded qa objects
+        """
+        if type(ids) == list:
+            return [self.qa[id] for id in ids]
+        elif type(ids) == int:
+            return [self.qa[ids]]
+
+    def showQA(self, anns):
+        """Display the specified annotations.
+
+        :param anns (array of object): annotations to display
+        :return: None
+        """
+        if len(anns) == 0:
+            return 0
+        for ann in anns:
+            quesId = ann['question_id']
+            print('Question: %s' % (self.qqa[quesId]['question']))
+            for ann in ann['answers']:
+                print('Answer %d: %s' % (ann['answer_id'], ann['answer']))
+
+    def loadRes(self, resFile, quesFile):
+        """Load result file and return a result object.
+
+        :param   resFile (str)     : file name of result file
+        :param   quesFile (str)    : file name of question file
+        :return: res (obj)         : result api object
+        """
+        res = VQA()
+        with open(quesFile) as ques_fp:  # close the handle after parsing
+            res.questions = json.load(ques_fp)
+        # Copy the dataset-level metadata from this object's question set.
+        for key in ('info', 'task_type', 'data_type', 'data_subtype', 'license'):
+            res.dataset[key] = copy.deepcopy(self.questions[key])
+
+        print('Loading and preparing results...     ')
+        time_t = datetime.datetime.utcnow()
+        with open(resFile) as res_fp:  # close the handle after parsing
+            anns = json.load(res_fp)
+        assert type(anns) == list, 'results is not an array of objects'
+        annsQuesIds = [ann['question_id'] for ann in anns]
+        assert set(annsQuesIds) == set(
+            self.getQuesIds()
+        ), 'Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is at least one question id that does not belong to the question ids in the annotation file.'
+        for ann in anns:
+            quesId = ann['question_id']
+            if res.dataset['task_type'] == 'Multiple Choice':
+                assert (
+                    ann['answer'] in self.qqa[quesId]['multiple_choices']
+                ), 'predicted answer is not one of the multiple choices'
+            qaAnn = self.qa[quesId]
+            ann['image_id'] = qaAnn['image_id']
+            ann['question_type'] = qaAnn['question_type']
+            ann['answer_type'] = qaAnn['answer_type']
+        print('DONE (t=%0.2fs)' %
+              ((datetime.datetime.utcnow() - time_t).total_seconds()))
+
+        res.dataset['annotations'] = anns
+        res.createIndex()
+        return res
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa_eval.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa_eval.py
new file mode 100644
index 00000000000..218719e3126
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa_eval.py
@@ -0,0 +1,330 @@
+"""Copyright (c) 2022, salesforce.com, inc.
+
+All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+
+# coding=utf-8
+
+__author__ = 'aagrawal'
+
+import re
+# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
+# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
+import sys
+
+
+class VQAEval:
+
+    def __init__(self, vqa=None, vqaRes=None, n=2):
+        self.n = n
+        self.accuracy = {}
+        self.evalQA = {}
+        self.evalQuesType = {}
+        self.evalAnsType = {}
+        self.vqa = vqa
+        self.vqaRes = vqaRes
+        if vqa is not None:
+            self.params = {'question_id': vqa.getQuesIds()}
+        self.contractions = {
+            'aint': "ain't",
+            'arent': "aren't",
+            'cant': "can't",
+            'couldve': "could've",
+            'couldnt': "couldn't",
+            "couldn'tve": "couldn't've",
+            "couldnt've": "couldn't've",
+            'didnt': "didn't",
+            'doesnt': "doesn't",
+            'dont': "don't",
+            'hadnt': "hadn't",
+            "hadnt've": "hadn't've",
+            "hadn'tve": "hadn't've",
+            'hasnt': "hasn't",
+            'havent': "haven't",
+            'hed': "he'd",
+            "hed've": "he'd've",
+            "he'dve": "he'd've",
+            'hes': "he's",
+            'howd': "how'd",
+            'howll': "how'll",
+            'hows': "how's",
+            "Id've": "I'd've",
+            "I'dve": "I'd've",
+            'Im': "I'm",
+            'Ive': "I've",
+            'isnt': "isn't",
+            'itd': "it'd",
+            "itd've": "it'd've",
+            "it'dve": "it'd've",
+            'itll': "it'll",
+            "let's": "let's",
+            'maam': "ma'am",
+            'mightnt': "mightn't",
+            "mightnt've": "mightn't've",
+            "mightn'tve": "mightn't've",
+            'mightve': "might've",
+            'mustnt': "mustn't",
+            'mustve': "must've",
+            'neednt': "needn't",
+            'notve': "not've",
+            'oclock': "o'clock",
+            'oughtnt': "oughtn't",
+            "ow's'at": "'ow's'at",
+            "'ows'at": "'ow's'at",
+            "'ow'sat": "'ow's'at",
+            'shant': "shan't",
+            "shed've": "she'd've",
+            "she'dve": "she'd've",
+            "she's": "she's",
+            'shouldve': "should've",
+            'shouldnt': "shouldn't",
+            "shouldnt've": "shouldn't've",
+            "shouldn'tve": "shouldn't've",
+            "somebody'd": 'somebodyd',
+            "somebodyd've": "somebody'd've",
+            "somebody'dve": "somebody'd've",
+            'somebodyll': "somebody'll",
+            'somebodys': "somebody's",
+            'someoned': "someone'd",
+            "someoned've": "someone'd've",
+            "someone'dve": "someone'd've",
+            'someonell': "someone'll",
+            'someones': "someone's",
+            'somethingd': "something'd",
+            "somethingd've": "something'd've",
+            "something'dve": "something'd've",
+            'somethingll': "something'll",
+            'thats': "that's",
+            'thered': "there'd",
+            "thered've": "there'd've",
+            "there'dve": "there'd've",
+            'therere': "there're",
+            'theres': "there's",
+            'theyd': "they'd",
+            "theyd've": "they'd've",
+            "they'dve": "they'd've",
+            'theyll': "they'll",
+            'theyre': "they're",
+            'theyve': "they've",
+            'twas': "'twas",
+            'wasnt': "wasn't",
+            "wed've": "we'd've",
+            "we'dve": "we'd've",
+            'weve': "we've",
+            'werent': "weren't",
+            'whatll': "what'll",
+            'whatre': "what're",
+            'whats': "what's",
+            'whatve': "what've",
+            'whens': "when's",
+            'whered': "where'd",
+            'wheres': "where's",
+            'whereve': "where've",
+            'whod': "who'd",
+            "whod've": "who'd've",
+            "who'dve": "who'd've",
+            'wholl': "who'll",
+            'whos': "who's",
+            'whove': "who've",
+            'whyll': "why'll",
+            'whyre': "why're",
+            'whys': "why's",
+            'wont': "won't",
+            'wouldve': "would've",
+            'wouldnt': "wouldn't",
+            "wouldnt've": "wouldn't've",
+            "wouldn'tve": "wouldn't've",
+            'yall': "y'all",
+            "yall'll": "y'all'll",
+            "y'allll": "y'all'll",
+            "yall'd've": "y'all'd've",
+            "y'alld've": "y'all'd've",
+            "y'all'dve": "y'all'd've",
+            'youd': "you'd",
+            "youd've": "you'd've",
+            "you'dve": "you'd've",
+            'youll': "you'll",
+            'youre': "you're",
+            'youve': "you've",
+        }
+        self.manualMap = {
+            'none': '0',
+            'zero': '0',
+            'one': '1',
+            'two': '2',
+            'three': '3',
+            'four': '4',
+            'five': '5',
+            'six': '6',
+            'seven': '7',
+            'eight': '8',
+            'nine': '9',
+            'ten': '10',
+        }
+        self.articles = ['a', 'an', 'the']
+
+        self.periodStrip = re.compile('(?!<=\d)(\.)(?!\d)')
+        self.commaStrip = re.compile('(\d)(,)(\d)')
+        self.punct = [
+            ';',
+            r'/',
+            '[',
+            ']',
+            '"',
+            '{',
+            '}',
+            '(',
+            ')',
+            '=',
+            '+',
+            '\\',
+            '_',
+            '-',
+            '>',
+            '<',
+            '@',
+            '`',
+            ',',
+            '?',
+            '!',
+        ]
+
+    def evaluate(self, quesIds=None):
+        if quesIds == None:
+            quesIds = [quesId for quesId in self.params['question_id']]
+        gts = {}
+        res = {}
+        for quesId in quesIds:
+            gts[quesId] = self.vqa.qa[quesId]
+            res[quesId] = self.vqaRes.qa[quesId]
+
+        # =================================================
+        # Compute accuracy
+        # =================================================
+        accQA = []
+        accQuesType = {}
+        accAnsType = {}
+        print('computing accuracy')
+        step = 0
+        for quesId in quesIds:
+            resAns = res[quesId]['answer']
+            resAns = resAns.replace('\n', ' ')
+            resAns = resAns.replace('\t', ' ')
+            resAns = resAns.strip()
+            resAns = self.processPunctuation(resAns)
+            resAns = self.processDigitArticle(resAns)
+            gtAcc = []
+            gtAnswers = [ann['answer'] for ann in gts[quesId]['answers']]
+            if len(set(gtAnswers)) > 1:
+                for ansDic in gts[quesId]['answers']:
+                    ansDic['answer'] = self.processPunctuation(
+                        ansDic['answer'])
+            for gtAnsDatum in gts[quesId]['answers']:
+                otherGTAns = [
+                    item for item in gts[quesId]['answers']
+                    if item != gtAnsDatum
+                ]
+                matchingAns = [
+                    item for item in otherGTAns if item['answer'] == resAns
+                ]
+                acc = min(1, float(len(matchingAns)) / 3)
+                gtAcc.append(acc)
+            quesType = gts[quesId]['question_type']
+            ansType = gts[quesId]['answer_type']
+            avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
+            accQA.append(avgGTAcc)
+            if quesType not in accQuesType:
+                accQuesType[quesType] = []
+            accQuesType[quesType].append(avgGTAcc)
+            if ansType not in accAnsType:
+                accAnsType[ansType] = []
+            accAnsType[ansType].append(avgGTAcc)
+            self.setEvalQA(quesId, avgGTAcc)
+            self.setEvalQuesType(quesId, quesType, avgGTAcc)
+            self.setEvalAnsType(quesId, ansType, avgGTAcc)
+            if step % 100 == 0:
+                self.updateProgress(step / float(len(quesIds)))
+            step = step + 1
+
+        self.setAccuracy(accQA, accQuesType, accAnsType)
+        print('Done computing accuracy')
+
+    def processPunctuation(self, inText):
+        """Strip or space out punctuation in ``inText`` and drop periods."""
+        outText = inText
+        hasNumComma = re.search(self.commaStrip, inText) is not None
+        for p in self.punct:
+            # Delete p when it borders a space (or a digit comma exists).
+            glued = not (hasNumComma or p + ' ' in inText or ' ' + p in inText)
+            outText = outText.replace(p, ' ' if glued else '')
+        # The third positional arg of sub() is count, so no re.UNICODE here.
+        return self.periodStrip.sub('', outText)
+
+    def processDigitArticle(self, inText):
+        """Lower-case ``inText``, map number words to digits, drop articles
+        and restore the apostrophe in known contractions.
+
+        :param inText (str): answer text to normalize
+        :return: (str) normalized words joined by single spaces
+        """
+        # .get() instead of .setdefault(): a lookup must not insert every
+        # word it sees into the shared manualMap.
+        words = [self.manualMap.get(w, w) for w in inText.lower().split()]
+        # Articles are filtered after number mapping, as in the original.
+        kept = [w for w in words if w not in self.articles]
+        # Contraction keys are matched against the lower-cased words.
+        return ' '.join(self.contractions.get(w, w) for w in kept)
+
+    def setAccuracy(self, accQA, accQuesType, accAnsType):
+        self.accuracy['overall'] = round(100 * float(sum(accQA)) / len(accQA),
+                                         self.n)
+        self.accuracy['perQuestionType'] = {
+            quesType: round(
+                100 * float(sum(accQuesType[quesType])) /
+                len(accQuesType[quesType]),
+                self.n,
+            )
+            for quesType in accQuesType
+        }
+        self.accuracy['perAnswerType'] = {
+            ansType: round(
+                100 * float(sum(accAnsType[ansType])) /
+                len(accAnsType[ansType]), self.n)
+            for ansType in accAnsType
+        }
+
+    def setEvalQA(self, quesId, acc):
+        self.evalQA[quesId] = round(100 * acc, self.n)
+
+    def setEvalQuesType(self, quesId, quesType, acc):
+        if quesType not in self.evalQuesType:
+            self.evalQuesType[quesType] = {}
+        self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
+
+    def setEvalAnsType(self, quesId, ansType, acc):
+        if ansType not in self.evalAnsType:
+            self.evalAnsType[ansType] = {}
+        self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
+
+    def updateProgress(self, progress):
+        barLength = 20
+        status = ''
+        if isinstance(progress, int):
+            progress = float(progress)
+        if not isinstance(progress, float):
+            progress = 0
+            status = 'error: progress var must be float\r\n'
+        if progress < 0:
+            progress = 0
+            status = 'Halt...\r\n'
+        if progress >= 1:
+            progress = 1
+            status = 'Done...\r\n'
+        block = int(round(barLength * progress))
+        text = '\rFinshed Percent: [{0}] {1}% {2}'.format(
+            '#' * block + '-' * (barLength - block), int(progress * 100),
+            status)
+        sys.stdout.write(text)
+        sys.stdout.flush()
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
new file mode 100644
index 00000000000..9269fbec37e
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -x
+device=0
+
+# --quant_vision    ## for vision quantization
+
+CUDA_VISIBLE_DEVICES=$device \
+python3 main.py \
+--model_name=Qwen/Qwen-VL \
+--bits 4 \
+--group_size 128 \
+--iters 200 \
+--seqlen 512 \
+--disable_quanted_input \
+--model_dtype fp32 \
+--image_folder /path/to/coco/images/train2017/ \
+--question_file /path/to/Qwen-VL_mix665k.json \
+--output_dir "./tmp_autoround"
+
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_eval.sh
new file mode 100644
index 00000000000..49dc90a25f7
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_eval.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -x
+device=0
+
+model_path='./tmp_autoround'
+model=Qwen-VL
+
+CUDA_VISIBLE_DEVICES=$device python3 mm_evaluation/main.py \
+--model_name ${model_path}/${model} \
+--trust_remote_code \
+--eval_bs 4
+
+
+
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt
new file mode 100644
index 00000000000..2d060638bbc
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt
@@ -0,0 +1,14 @@
+transformers
+torch
+tiktoken
+transformers_stream_generator
+peft
+sentencepiece
+einops
+accelerate
+datasets
+protobuf
+auto-gptq
+openpyxl
+wandb
+py-cpuinfo
\ No newline at end of file

From 8cc273be62cb562eaadecce0d1367c9a4aea1735 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 26 Sep 2024 14:02:03 +0800
Subject: [PATCH 02/33] bugfix, add utils

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 neural_compressor/common/base_config.py  |   3 +-
 neural_compressor/torch/utils/utility.py | 226 +++++++++++++++++++++++
 2 files changed, 228 insertions(+), 1 deletion(-)

diff --git a/neural_compressor/common/base_config.py b/neural_compressor/common/base_config.py
index d54e2e6515b..faf3fcf67cf 100644
--- a/neural_compressor/common/base_config.py
+++ b/neural_compressor/common/base_config.py
@@ -291,7 +291,7 @@ def set_local(self, operator_name_or_list: Union[List, str, Callable], config: B
                 self.local_config[operator_name] = config
         else:
             if operator_name_or_list in self.local_config:
-                logger.warning("The configuration for %s has already been set, update it.", operator_name)
+                logger.warning("The configuration for %s has already been set, update it.", operator_name_or_list)
             self.local_config[operator_name_or_list] = config
         return self
 
@@ -931,3 +931,4 @@ def tensorboard(self, tensorboard):
 
 
 options = Options()
+
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index 2a6fe5aae64..ca2cc474eba 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -340,3 +340,229 @@ def load_empty_model(pretrained_model_name_or_path, cls=None, **kwargs):
     model.eval()
     model.path = pretrained_model_name_or_path
     return model
+
+
+
+import torch
+import transformers
+
+
+def get_module(module, key):
+    """Get module from model by key name.
+
+    Args:
+        module (torch.nn.Module): original model
+        key (str): module name to be replaced
+    """
+    name_list = key.split(".")
+    for name in name_list:
+        module = getattr(module, name, None)
+    return module
+
+
+def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
+                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
+    """Retrieves the names of layers within each block of the model.
+
+    Returns:
+        list: A list of strings, where each string is the name of a layer
+              within a block of the model.
+    """
+    for n, m in model.named_modules():
+        if isinstance(m, tuple(supported_types)):
+            m.tmp_name = n
+    layers_in_block = []
+    if bool(quant_block_list):
+        all_blocks = quant_block_list
+    else:
+        all_blocks = get_block_names(model)
+    for block_names in all_blocks:
+        for block_name in block_names:
+            block = get_module(model, block_name)
+            for n, m in block.named_modules():
+                if hasattr(m, "tmp_name"):
+                    layers_in_block.append(m.tmp_name)
+    for n, m in model.named_modules():
+        if hasattr(m, "tmp_name"):
+            delattr(m, "tmp_name")
+    return layers_in_block
+
+
+def to_dtype(input, dtype=torch.float32):
+    """Moves input data to the specified data type.
+
+    Args:
+    input: The input data to be moved.
+    dtype: The target data type.
+
+    Returns:
+    The input data on the specified data type.
+    """
+    if input is None:
+        return None
+    if isinstance(input, torch.Tensor):
+        return input.to(dtype)
+    if isinstance(input, dict) or isinstance(input, UserDict):
+        for inp in input.keys():
+            input[inp] = to_dtype(input[inp], dtype)
+
+    elif isinstance(input, list) or isinstance(input, tuple):
+        if len(input) == 0:
+            return input
+        input_res = []
+        for inp in input:
+            input_res.append(to_dtype(inp, dtype))
+        if isinstance(input, tuple):
+            input_res = tuple(input_res)
+        input = input_res
+
+    return input
+
+# for VLM usage
+def to_device(input, device=torch.device("cpu")):
+    """Moves input data to the specified device.
+
+    Args:
+    input: The input data to be moved.
+    device: The target device.
+
+    Returns:
+    The input data on the specified device.
+    """
+    if input is None:
+        return None
+    if isinstance(input, torch.Tensor):
+        return input.to(device)
+    if isinstance(input, dict) or isinstance(input, UserDict):
+        for inp in input.keys():
+            input[inp] = to_device(input[inp], device)
+
+    elif isinstance(input, list) or isinstance(input, tuple):
+        if len(input) == 0:
+            return input
+        input_res = []
+        for inp in input:
+            input_res.append(to_device(inp, device))
+        if isinstance(input, tuple):
+            input_res = tuple(input_res)
+        input = input_res
+
+    return input
+
+
+def validate_modules(module_names):
+    """
+    Test a list of modules' validity.
+
+    Args:
+    modules (list of str): List of strings to be validated.
+
+    Returns:
+    bool: True if all modules have equal length or not dependent, otherwise False.
+    """
+    if not bool(module_names):  # pragma: no cover
+        raise ValueError(f"Empty modules")
+    if len(module_names) < 2:
+        return True
+    split_modules = [s.split('.') for s, _ in module_names]
+    lengths = [len(parts) for parts in split_modules]
+    if len(set(lengths)) == 1:  # pragma: no cover
+        return True
+    max_length = max(lengths)
+    min_length = min(lengths)
+    longest_module = next(s for s in split_modules if len(s) == max_length)
+    shortest_module = next(s for s in split_modules if len(s) == min_length)
+    shortest_module = '.'.join(shortest_module)
+    longest_module = '.'.join(longest_module)
+    # Check if the shortest name is a substring of the longest name
+    if shortest_module in longest_module:  # pragma: no cover
+        raise ValueError(f"Invalid modules, at least two modules detected" \
+                         " as dependent, {shortest_module} and {longest_module}")
+    return True
+
+
+def get_multimodal_block_names(model, quant_vision=False):
+    """Get the multimodal model block names for transformers-like networks.
+
+    Args:
+    model: The model.
+
+    Returns:
+    block_names: A list whose elements are list of block's layer names
+    """
+    block_names = []
+    target_modules = []
+    Vison_blocks_tuple = ("vision", "visual",)
+    for n, m in model.named_modules():
+        if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
+            if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
+                target_modules.append((n, m))
+    validate_modules(target_modules)
+    for i, target_m in enumerate(target_modules):
+        block_names.append([])
+        for n, m in target_m[1].named_children():
+            block_names[i].append(target_m[0] + "." + n)
+    return block_names
+
+
+def detect_device(device=None):
+    """Return the target device name; None, "auto" or an index auto-detects."""
+    def is_valid_digit(s):
+        try:
+            return 0 <= int(s)
+        except (TypeError, ValueError, OverflowError):  # not integer-like
+            return False
+
+    dev_idx = None
+    if is_valid_digit(device):
+        dev_idx = int(device)
+        device = "auto"
+    if device is None or device == "auto":
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+            print("Using GPU device")
+        elif is_optimum_habana_available():  # pragma: no cover
+            device = torch.device("hpu")
+            print("Using HPU device")
+        # Use CPU as a fallback
+        else:
+            device = torch.device("cpu")
+            print("Using CPU device")
+        if dev_idx is not None and str(device) != "cpu":
+            device = str(device) + f":{dev_idx}"
+        return str(device)
+    elif isinstance(device, torch.device):
+        device = str(device)
+    return device
+
+
+def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
+    device = model.orig_model.device
+    total_cnt = 0
+    for org_data in dataloader:
+        if isinstance(org_data, torch.Tensor):
+            input_ids = org_data.to(device)
+            data = input_ids
+        elif isinstance(org_data, tuple) or isinstance(org_data, list):
+                data = org_data
+                input_ids = data[0]
+        else:
+            data = {}
+            for key in org_data.keys():
+                data[key] = to_device(org_data[key], device)
+                if key == 'images':
+                    data[key] = to_dtype(org_data[key], model.orig_model.dtype)
+            input_ids = data["input_ids"]
+        if input_ids.shape[-1] < seqlen:
+            continue
+        
+        if isinstance(data, tuple) or isinstance(data, list):
+            model(*data)
+        elif isinstance(data, dict):
+            model(**data)
+        else:
+            model(data)
+        total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
+        if total_cnt >= nsamples:
+            break
+

From 85975971bf40b74da51b9a1bb4fb1ad122069a67 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 26 Sep 2024 06:13:50 +0000
Subject: [PATCH 03/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/common/base_config.py  |  1 -
 neural_compressor/torch/utils/utility.py | 37 +++++++++++++-----------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/neural_compressor/common/base_config.py b/neural_compressor/common/base_config.py
index faf3fcf67cf..97c50c2333d 100644
--- a/neural_compressor/common/base_config.py
+++ b/neural_compressor/common/base_config.py
@@ -931,4 +931,3 @@ def tensorboard(self, tensorboard):
 
 
 options = Options()
-
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index ca2cc474eba..b96987e3b18 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -342,7 +342,6 @@ def load_empty_model(pretrained_model_name_or_path, cls=None, **kwargs):
     return model
 
 
-
 import torch
 import transformers
 
@@ -360,8 +359,9 @@ def get_module(module, key):
     return module
 
 
-def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
-                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
+def get_layer_names_in_block(
+    model, supported_types=[torch.nn.Linear, transformers.modeling_utils.Conv1D], quant_block_list=None
+):
     """Retrieves the names of layers within each block of the model.
 
     Returns:
@@ -418,6 +418,7 @@ def to_dtype(input, dtype=torch.float32):
 
     return input
 
+
 # for VLM usage
 def to_device(input, device=torch.device("cpu")):
     """Moves input data to the specified device.
@@ -451,8 +452,7 @@ def to_device(input, device=torch.device("cpu")):
 
 
 def validate_modules(module_names):
-    """
-    Test a list of modules' validity.
+    """Test a list of modules' validity.
 
     Args:
     modules (list of str): List of strings to be validated.
@@ -461,10 +461,10 @@ def validate_modules(module_names):
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
     if not bool(module_names):  # pragma: no cover
-        raise ValueError(f"Empty modules")
+        raise ValueError("Empty modules")
     if len(module_names) < 2:
         return True
-    split_modules = [s.split('.') for s, _ in module_names]
+    split_modules = [s.split(".") for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
     if len(set(lengths)) == 1:  # pragma: no cover
         return True
@@ -472,12 +472,13 @@ def validate_modules(module_names):
     min_length = min(lengths)
     longest_module = next(s for s in split_modules if len(s) == max_length)
     shortest_module = next(s for s in split_modules if len(s) == min_length)
-    shortest_module = '.'.join(shortest_module)
-    longest_module = '.'.join(longest_module)
+    shortest_module = ".".join(shortest_module)
+    longest_module = ".".join(longest_module)
     # Check if the shortest name is a substring of the longest name
     if shortest_module in longest_module:  # pragma: no cover
-        raise ValueError(f"Invalid modules, at least two modules detected" \
-                         " as dependent, {shortest_module} and {longest_module}")
+        raise ValueError(
+            "Invalid modules, at least two modules detected" " as dependent, {shortest_module} and {longest_module}"
+        )
     return True
 
 
@@ -492,7 +493,10 @@ def get_multimodal_block_names(model, quant_vision=False):
     """
     block_names = []
     target_modules = []
-    Vison_blocks_tuple = ("vision", "visual",)
+    Vison_blocks_tuple = (
+        "vision",
+        "visual",
+    )
     for n, m in model.named_modules():
         if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
             if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
@@ -544,18 +548,18 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
             input_ids = org_data.to(device)
             data = input_ids
         elif isinstance(org_data, tuple) or isinstance(org_data, list):
-                data = org_data
-                input_ids = data[0]
+            data = org_data
+            input_ids = data[0]
         else:
             data = {}
             for key in org_data.keys():
                 data[key] = to_device(org_data[key], device)
-                if key == 'images':
+                if key == "images":
                     data[key] = to_dtype(org_data[key], model.orig_model.dtype)
             input_ids = data["input_ids"]
         if input_ids.shape[-1] < seqlen:
             continue
-        
+
         if isinstance(data, tuple) or isinstance(data, list):
             model(*data)
         elif isinstance(data, dict):
@@ -565,4 +569,3 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
         total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
         if total_cnt >= nsamples:
             break
-

From 53b4b32c20ee9352ee1766883033b1dfb98440c0 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 26 Sep 2024 14:56:37 +0800
Subject: [PATCH 04/33] fix docstring issues

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 neural_compressor/torch/utils/utility.py      | 66 ++++++++++++-------
 .../weight_only/test_autoround.py             | 22 +++++++
 2 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index b96987e3b18..06de6299359 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -29,6 +29,7 @@
     detect_processor_type_based_on_hw,
     logger,
 )
+import transformers
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -342,10 +343,6 @@ def load_empty_model(pretrained_model_name_or_path, cls=None, **kwargs):
     return model
 
 
-import torch
-import transformers
-
-
 def get_module(module, key):
     """Get module from model by key name.
 
@@ -359,9 +356,8 @@ def get_module(module, key):
     return module
 
 
-def get_layer_names_in_block(
-    model, supported_types=[torch.nn.Linear, transformers.modeling_utils.Conv1D], quant_block_list=None
-):
+def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
+                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
     """Retrieves the names of layers within each block of the model.
 
     Returns:
@@ -418,7 +414,6 @@ def to_dtype(input, dtype=torch.float32):
 
     return input
 
-
 # for VLM usage
 def to_device(input, device=torch.device("cpu")):
     """Moves input data to the specified device.
@@ -452,7 +447,8 @@ def to_device(input, device=torch.device("cpu")):
 
 
 def validate_modules(module_names):
-    """Test a list of modules' validity.
+    """
+    Test a list of modules' validity.
 
     Args:
     modules (list of str): List of strings to be validated.
@@ -461,10 +457,10 @@ def validate_modules(module_names):
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
     if not bool(module_names):  # pragma: no cover
-        raise ValueError("Empty modules")
+        raise ValueError(f"Empty modules")
     if len(module_names) < 2:
         return True
-    split_modules = [s.split(".") for s, _ in module_names]
+    split_modules = [s.split('.') for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
     if len(set(lengths)) == 1:  # pragma: no cover
         return True
@@ -472,13 +468,12 @@ def validate_modules(module_names):
     min_length = min(lengths)
     longest_module = next(s for s in split_modules if len(s) == max_length)
     shortest_module = next(s for s in split_modules if len(s) == min_length)
-    shortest_module = ".".join(shortest_module)
-    longest_module = ".".join(longest_module)
+    shortest_module = '.'.join(shortest_module)
+    longest_module = '.'.join(longest_module)
     # Check if the shortest name is a substring of the longest name
     if shortest_module in longest_module:  # pragma: no cover
-        raise ValueError(
-            "Invalid modules, at least two modules detected" " as dependent, {shortest_module} and {longest_module}"
-        )
+        raise ValueError(f"Invalid modules, at least two modules detected" \
+                         " as dependent, {shortest_module} and {longest_module}")
     return True
 
 
@@ -493,10 +488,7 @@ def get_multimodal_block_names(model, quant_vision=False):
     """
     block_names = []
     target_modules = []
-    Vison_blocks_tuple = (
-        "vision",
-        "visual",
-    )
+    Vison_blocks_tuple = ("vision", "visual",)
     for n, m in model.named_modules():
         if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
             if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
@@ -510,6 +502,19 @@ def get_multimodal_block_names(model, quant_vision=False):
 
 
 def detect_device(device=None):
+    """
+    Detects the device to use for model execution (GPU, HPU, or CPU).
+
+    Args:
+        device (str, int, torch.device, optional): 
+            - If a string ('cuda', 'cpu', or 'hpu') or torch.device is provided, that device is selected.
+            - If an integer is provided, it treats it as a GPU device index.
+            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available, 
+              or falls back to 'cpu'.
+
+    Returns:
+        str: The selected device in string format ('cuda:X', 'hpu', or 'cpu').
+    """
     def is_valid_digit(s):
         try:
             num = int(s)
@@ -541,6 +546,18 @@ def is_valid_digit(s):
 
 
 def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
+    """
+    Runs a model on a provided dataset with automatic device detection for vector-language models.
+
+    Args:
+        model: The model to run.
+        dataloader: A PyTorch dataloader providing the input data for the model.
+        seqlen (int, optional): The minimum sequence length of input data to process. Defaults to 512.
+        nsamples (int, optional): The number of samples to process before stopping. Defaults to 512.
+
+    Returns:
+        None
+    """
     device = model.orig_model.device
     total_cnt = 0
     for org_data in dataloader:
@@ -548,18 +565,18 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
             input_ids = org_data.to(device)
             data = input_ids
         elif isinstance(org_data, tuple) or isinstance(org_data, list):
-            data = org_data
-            input_ids = data[0]
+                data = org_data
+                input_ids = data[0]
         else:
             data = {}
             for key in org_data.keys():
                 data[key] = to_device(org_data[key], device)
-                if key == "images":
+                if key == 'images':
                     data[key] = to_dtype(org_data[key], model.orig_model.dtype)
             input_ids = data["input_ids"]
         if input_ids.shape[-1] < seqlen:
             continue
-
+        
         if isinstance(data, tuple) or isinstance(data, list):
             model(*data)
         elif isinstance(data, dict):
@@ -569,3 +586,4 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
         total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
         if total_cnt >= nsamples:
             break
+
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 8a3942e3f98..2e1eef0cc5b 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -162,6 +162,27 @@ def test_conv1d(self):
         assert torch.allclose(out2, out1, atol=0.01), "Accuracy gap atol > 0.01 is unexpected."
         assert isinstance(q_model.h[0].attn.c_attn, WeightOnlyLinear), "loading compressed model failed."
 
+    def test_utils(self):
+        from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
+                                                            get_layer_names_in_block,
+                                                            detect_device,
+                                                            run_fn_for_vlm_autoround)
+        fp32_model = copy.deepcopy(self.gptj)
+        quant_block_list = get_multimodal_block_names(fp32_model, quant_vision=True)
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp16", quant_block_list=quant_block_list)
+        logger.info(f"Test AutoRound with config {quant_config}")
+        device = detect_device("auto")
+        layers_list = get_layer_names_in_block(fp32_model, quant_block_list=quant_block_list)
+        fp32_model.to(device)
+        # quantizer execute
+        model = prepare(model=fp32_model, quant_config=quant_config)
+        run_fn_for_vlm_autoround(model, self.dataloader, seqlen=32, nsamples=8)
+        q_model = convert(model)
+        out = q_model(self.inp)[0]
+        assert torch.allclose(out, self.label, atol=1e-1)
+        assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."
+        
+        
     # def test_autoround_format_export(self):
     #     from neural_compressor.torch.quantization import load
     #     from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear
@@ -176,3 +197,4 @@ def test_conv1d(self):
     #     assert isinstance(q_model.transformer.h[0].attn.k_proj, QuantLinear), "packing model failed."
     #     q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
     #     loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)
+

From f915e4949d461ab4084f79c6d678279a96bdd861 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 26 Sep 2024 06:57:53 +0000
Subject: [PATCH 05/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/torch/utils/utility.py      | 49 ++++++++++---------
 .../weight_only/test_autoround.py             | 19 ++++---
 2 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index 06de6299359..e370ce890b4 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -19,6 +19,7 @@
 
 import psutil
 import torch
+import transformers
 from typing_extensions import TypeAlias
 
 from neural_compressor.common.utils import (
@@ -29,7 +30,6 @@
     detect_processor_type_based_on_hw,
     logger,
 )
-import transformers
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -356,8 +356,9 @@ def get_module(module, key):
     return module
 
 
-def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
-                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
+def get_layer_names_in_block(
+    model, supported_types=[torch.nn.Linear, transformers.modeling_utils.Conv1D], quant_block_list=None
+):
     """Retrieves the names of layers within each block of the model.
 
     Returns:
@@ -414,6 +415,7 @@ def to_dtype(input, dtype=torch.float32):
 
     return input
 
+
 # for VLM usage
 def to_device(input, device=torch.device("cpu")):
     """Moves input data to the specified device.
@@ -447,8 +449,7 @@ def to_device(input, device=torch.device("cpu")):
 
 
 def validate_modules(module_names):
-    """
-    Test a list of modules' validity.
+    """Test a list of modules' validity.
 
     Args:
     modules (list of str): List of strings to be validated.
@@ -457,10 +458,10 @@ def validate_modules(module_names):
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
     if not bool(module_names):  # pragma: no cover
-        raise ValueError(f"Empty modules")
+        raise ValueError("Empty modules")
     if len(module_names) < 2:
         return True
-    split_modules = [s.split('.') for s, _ in module_names]
+    split_modules = [s.split(".") for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
     if len(set(lengths)) == 1:  # pragma: no cover
         return True
@@ -468,12 +469,13 @@ def validate_modules(module_names):
     min_length = min(lengths)
     longest_module = next(s for s in split_modules if len(s) == max_length)
     shortest_module = next(s for s in split_modules if len(s) == min_length)
-    shortest_module = '.'.join(shortest_module)
-    longest_module = '.'.join(longest_module)
+    shortest_module = ".".join(shortest_module)
+    longest_module = ".".join(longest_module)
     # Check if the shortest name is a substring of the longest name
     if shortest_module in longest_module:  # pragma: no cover
-        raise ValueError(f"Invalid modules, at least two modules detected" \
-                         " as dependent, {shortest_module} and {longest_module}")
+        raise ValueError(  # single f-string so both module names are actually interpolated
+            f"Invalid modules, at least two modules detected as dependent, {shortest_module} and {longest_module}"
+        )
     return True
 
 
@@ -488,7 +490,10 @@ def get_multimodal_block_names(model, quant_vision=False):
     """
     block_names = []
     target_modules = []
-    Vison_blocks_tuple = ("vision", "visual",)
+    Vison_blocks_tuple = (
+        "vision",
+        "visual",
+    )
     for n, m in model.named_modules():
         if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
             if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
@@ -502,19 +507,19 @@ def get_multimodal_block_names(model, quant_vision=False):
 
 
 def detect_device(device=None):
-    """
-    Detects the device to use for model execution (GPU, HPU, or CPU).
+    """Detects the device to use for model execution (GPU, HPU, or CPU).
 
     Args:
-        device (str, int, torch.device, optional): 
+        device (str, int, torch.device, optional):
             - If a string ('cuda', 'cpu', or 'hpu') or torch.device is provided, that device is selected.
             - If an integer is provided, it treats it as a GPU device index.
-            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available, 
+            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available,
               or falls back to 'cpu'.
 
     Returns:
         str: The selected device in string format ('cuda:X', 'hpu', or 'cpu').
     """
+
     def is_valid_digit(s):
         try:
             num = int(s)
@@ -546,8 +551,7 @@ def is_valid_digit(s):
 
 
 def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
-    """
-    Runs a model on a provided dataset with automatic device detection for vector-language models.
+    """Runs batches from a dataloader through a vision-language model (VLM) on the model's own device.
 
     Args:
         model: The model to run.
@@ -565,18 +569,18 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
             input_ids = org_data.to(device)
             data = input_ids
         elif isinstance(org_data, tuple) or isinstance(org_data, list):
-                data = org_data
-                input_ids = data[0]
+            data = org_data
+            input_ids = data[0]
         else:
             data = {}
             for key in org_data.keys():
                 data[key] = to_device(org_data[key], device)
-                if key == 'images':
+                if key == "images":
                     data[key] = to_dtype(org_data[key], model.orig_model.dtype)
             input_ids = data["input_ids"]
         if input_ids.shape[-1] < seqlen:
             continue
-        
+
         if isinstance(data, tuple) or isinstance(data, list):
             model(*data)
         elif isinstance(data, dict):
@@ -586,4 +590,3 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
         total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
         if total_cnt >= nsamples:
             break
-
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 2e1eef0cc5b..79245345cbb 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -163,13 +163,18 @@ def test_conv1d(self):
         assert isinstance(q_model.h[0].attn.c_attn, WeightOnlyLinear), "loading compressed model failed."
 
     def test_utils(self):
-        from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                            get_layer_names_in_block,
-                                                            detect_device,
-                                                            run_fn_for_vlm_autoround)
+        from neural_compressor.torch.utils.utility import (
+            detect_device,
+            get_layer_names_in_block,
+            get_multimodal_block_names,
+            run_fn_for_vlm_autoround,
+        )
+
         fp32_model = copy.deepcopy(self.gptj)
         quant_block_list = get_multimodal_block_names(fp32_model, quant_vision=True)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp16", quant_block_list=quant_block_list)
+        quant_config = AutoRoundConfig(
+            nsamples=32, seqlen=10, iters=10, scale_dtype="fp16", quant_block_list=quant_block_list
+        )
         logger.info(f"Test AutoRound with config {quant_config}")
         device = detect_device("auto")
         layers_list = get_layer_names_in_block(fp32_model, quant_block_list=quant_block_list)
@@ -181,8 +186,7 @@ def test_utils(self):
         out = q_model(self.inp)[0]
         assert torch.allclose(out, self.label, atol=1e-1)
         assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."
-        
-        
+
     # def test_autoround_format_export(self):
     #     from neural_compressor.torch.quantization import load
     #     from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear
@@ -197,4 +201,3 @@ def test_utils(self):
     #     assert isinstance(q_model.transformer.h[0].attn.k_proj, QuantLinear), "packing model failed."
     #     q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
     #     loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)
-

From c5127a367223dacc90c4b927438cad4ac0f4a827 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 26 Sep 2024 15:11:36 +0800
Subject: [PATCH 06/33] bugfix

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 neural_compressor/torch/utils/utility.py | 41 ++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index 06de6299359..fc2dd4a5f6c 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -30,6 +30,8 @@
     logger,
 )
 import transformers
+from collections import UserDict
+
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -46,6 +48,22 @@
 QCONFIG_NAME = "qconfig.json"
 
 
+def is_optimum_habana_available():
+    """
+    Checks if the Optimum Habana module is available for use with the transformers library.
+
+    This function checks two conditions:
+    1. If the `optimum` package is available using `transformers.utils.import_utils.is_optimum_available`.
+    2. If the `optimum.habana` module can be found using `importlib.util.find_spec` (only checked when 1 holds).
+
+    Returns:
+        bool: True if Optimum Habana is available, False otherwise.
+    """
+    import importlib.util  # function-local so the name is bound without relying on module-level imports
+    from transformers.utils.import_utils import is_optimum_available
+    return is_optimum_available() and importlib.util.find_spec("optimum.habana") is not None
+
+
 def register_algo(name):
     """Decorator function to register algorithms in the algos_mapping dictionary.
 
@@ -414,6 +432,7 @@ def to_dtype(input, dtype=torch.float32):
 
     return input
 
+
 # for VLM usage
 def to_device(input, device=torch.device("cpu")):
     """Moves input data to the specified device.
@@ -446,6 +465,28 @@ def to_device(input, device=torch.device("cpu")):
     return input
 
 
+def get_block_names(model):
+    """Collect the child names of the first ModuleList found in a transformers-like model.
+
+    Args:
+    model: The model whose ``named_modules()`` are scanned in order.
+
+    Returns:
+    block_names: ``[]`` when no ModuleList exists, otherwise a one-element list
+    holding the qualified names ("<list name>.<child name>") of that list's children.
+    """
+    for list_name, list_module in model.named_modules():
+        module_cls = type(list_module)
+        # Match on the class name so ModuleList subclasses/look-alikes are picked up too.
+        if not (hasattr(module_cls, "__name__") and "ModuleList" in module_cls.__name__):
+            continue
+        # Stop at the first match; any later ModuleList is deliberately ignored.
+        child_names = [list_name + "." + child for child, _ in list_module.named_children()]
+        return [child_names]
+    # No ModuleList anywhere in the model.
+    return []
+
+
 def validate_modules(module_names):
     """
     Test a list of modules' validity.

From 6b4c2ff41876b885801096f09723a008e88ca93f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 26 Sep 2024 07:17:38 +0000
Subject: [PATCH 07/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/torch/utils/utility.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index 14d4844b45a..b5076bd1a1e 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -15,6 +15,7 @@
 
 
 import enum
+from collections import UserDict
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import psutil
@@ -30,8 +31,6 @@
     detect_processor_type_based_on_hw,
     logger,
 )
-import transformers
-from collections import UserDict
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -49,8 +48,7 @@
 
 
 def is_optimum_habana_available():
-    """
-    Checks if the Optimum Habana module is available for use with the transformers library.
+    """Checks if the Optimum Habana module is available for use with the transformers library.
 
     This function checks two conditions:
     1. If the `optimum` package is available using `transformers.utils.import_utils.is_optimum_available`.

From ecd5410b4bff0a50cb461a3c090a0689c233dafc Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 26 Sep 2024 15:38:33 +0800
Subject: [PATCH 08/33] refine examples

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Llava/main.py        | 16 ++++------------
 .../quantization/auto_round/Phi-3-vision/main.py | 14 +++++---------
 .../quantization/auto_round/Qwen-VL/main.py      |  9 ++-------
 3 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
index 07417f013d9..b9bd77c0aef 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
@@ -1,12 +1,11 @@
 import argparse
-# import sys
 parser = argparse.ArgumentParser()
 import torch
 import os
 import transformers
-
-os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-torch.use_deterministic_algorithms(True, warn_only=True)
+# # os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+# torch.use_deterministic_algorithms(True, warn_only=True)
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
 from transformers import set_seed
 
@@ -21,8 +20,6 @@
 from llava.train.train import preprocess, preprocess_multimodal, DataCollatorForSupervisedDataset
 from llava.model.builder import load_pretrained_model
 from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                    to_device,
-                                                    to_dtype,
                                                     get_layer_names_in_block,
                                                     detect_device,
                                                     run_fn_for_vlm_autoround
@@ -247,8 +244,6 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
             model_name = model_name[:-1]
         print(model_name, flush=True)
 
-        from auto_round.utils import detect_device
-
         device_str = detect_device(args.device)
         torch_dtype = "auto"
         torch_device = torch.device(device_str)
@@ -334,7 +329,6 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
                     
         if not args.quant_lm_head:
                 quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
-                # layer_config[lm_head_layer_name] = {"bits": args.bits}
                 transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
                 if transformers_version[0] == 4 and transformers_version[1] < 38:
                     error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
@@ -364,9 +358,6 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
         model = model.to(torch_device)
         model_path = args.model_name
         model_name = get_model_name_from_path(model_path)
-        # torch_dtype = "auto"
-        # tokenizer, model, image_processor, _ = load_pretrained_model(model_path, model_base=None, model_name=model_name,
-        #         torch_dtype=torch_dtype)
         from mm_evaluation import TextVQAEvaluator
         evaluator = TextVQAEvaluator(
             model,
@@ -383,3 +374,4 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
 
 
 
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
index 351588eaeb5..432afc35d9e 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
@@ -4,9 +4,9 @@
 parser = argparse.ArgumentParser()
 import torch
 import os
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-torch.use_deterministic_algorithms(True, warn_only=True)
+# os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+# torch.use_deterministic_algorithms(True, warn_only=True)
 import copy
 from PIL import Image
 import json
@@ -26,8 +26,6 @@
 IMAGE_TOKEN_INDEX = -200
 IGNORE_INDEX = -100
 from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                    to_device,
-                                                    to_dtype,
                                                     get_layer_names_in_block,
                                                     detect_device,
                                                     run_fn_for_vlm_autoround
@@ -170,8 +168,7 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
 if __name__ == '__main__':
 
     parser.add_argument(
-        "--model_name", default="microsoft/Phi-3-vision-128k-instruct"
-    )
+        "--model_name", default="microsoft/Phi-3-vision-128k-instruct")
     
     parser.add_argument("--quantize", action="store_true")
     
@@ -391,7 +388,6 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
                     
         if not args.quant_lm_head:
                 quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
-                # layer_config[lm_head_layer_name] = {"bits": args.bits}
                 transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
                 if transformers_version[0] == 4 and transformers_version[1] < 38:
                     error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
@@ -402,7 +398,6 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
         run_fn_for_vlm_autoround(user_model, *run_args)
         user_model = convert(user_model)
 
-        # user_model.save(args.output_dir, format="huggingface")
         from neural_compressor.torch.utils import (LoadFormat,)
         user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE, safe_serialization=False)
         if tokenizer is not None:
@@ -428,3 +423,4 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
     #     print(make_table(res))
 
 
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py
index 22b51252a56..428e985f628 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py
@@ -1,6 +1,4 @@
 import argparse
-# import sys
-# sys.path.insert(0, '../../..')
 parser = argparse.ArgumentParser()
 import torch
 import os
@@ -25,8 +23,6 @@
 DEFAULT_IM_START_TOKEN = '<img>'
 DEFAULT_IM_END_TOKEN = '</img>'
 from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                    to_device,
-                                                    to_dtype,
                                                     get_layer_names_in_block,
                                                     detect_device,
                                                     run_fn_for_vlm_autoround
@@ -487,14 +483,12 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         run_fn_for_vlm_autoround(user_model, *run_args)
         user_model = convert(user_model)
 
-        # user_model.save(args.output_dir, format="huggingface")
         from neural_compressor.torch.utils import (LoadFormat,)
         user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE)
         if tokenizer is not None:
             tokenizer.save_pretrained(args.output_dir)
 
-    if args.accuracy:  ## TODO
-        # model = model.half()
+    if args.accuracy:
         model_name = args.model_name
         device_str = detect_device(args.device)
         torch_device = torch.device(device_str)
@@ -524,3 +518,4 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
                 )
 
 
+

From 92125758f3663010ae6ae6b1be6b48d708ab9ad3 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 26 Sep 2024 15:52:50 +0800
Subject: [PATCH 09/33] fix scan issue

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 neural_compressor/torch/utils/utility.py | 54 ++++++++++++------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index b5076bd1a1e..7c89fec8652 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -15,12 +15,10 @@
 
 
 import enum
-from collections import UserDict
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import psutil
 import torch
-import transformers
 from typing_extensions import TypeAlias
 
 from neural_compressor.common.utils import (
@@ -31,6 +29,9 @@
     detect_processor_type_based_on_hw,
     logger,
 )
+import transformers
+from collections import UserDict
+import importlib
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -48,7 +49,8 @@
 
 
 def is_optimum_habana_available():
-    """Checks if the Optimum Habana module is available for use with the transformers library.
+    """
+    Checks if the Optimum Habana module is available for use with the transformers library.
 
     This function checks two conditions:
     1. If the `optimum` package is available using `transformers.utils.import_utils.is_optimum_available`.
@@ -372,9 +374,8 @@ def get_module(module, key):
     return module
 
 
-def get_layer_names_in_block(
-    model, supported_types=[torch.nn.Linear, transformers.modeling_utils.Conv1D], quant_block_list=None
-):
+def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
+                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
     """Retrieves the names of layers within each block of the model.
 
     Returns:
@@ -487,7 +488,8 @@ def get_block_names(model):
 
 
 def validate_modules(module_names):
-    """Test a list of modules' validity.
+    """
+    Test a list of modules' validity.
 
     Args:
     modules (list of str): List of strings to be validated.
@@ -496,10 +498,10 @@ def validate_modules(module_names):
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
     if not bool(module_names):  # pragma: no cover
-        raise ValueError("Empty modules")
+        raise ValueError(f"Empty modules")
     if len(module_names) < 2:
         return True
-    split_modules = [s.split(".") for s, _ in module_names]
+    split_modules = [s.split('.') for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
     if len(set(lengths)) == 1:  # pragma: no cover
         return True
@@ -507,13 +509,12 @@ def validate_modules(module_names):
     min_length = min(lengths)
     longest_module = next(s for s in split_modules if len(s) == max_length)
     shortest_module = next(s for s in split_modules if len(s) == min_length)
-    shortest_module = ".".join(shortest_module)
-    longest_module = ".".join(longest_module)
+    shortest_module = '.'.join(shortest_module)
+    longest_module = '.'.join(longest_module)
     # Check if the shortest name is a substring of the longest name
     if shortest_module in longest_module:  # pragma: no cover
-        raise ValueError(
-            "Invalid modules, at least two modules detected" " as dependent, {shortest_module} and {longest_module}"
-        )
+        raise ValueError(f"Invalid modules, at least two modules detected" \
+                         " as dependent, {shortest_module} and {longest_module}")
     return True
 
 
@@ -528,10 +529,7 @@ def get_multimodal_block_names(model, quant_vision=False):
     """
     block_names = []
     target_modules = []
-    Vison_blocks_tuple = (
-        "vision",
-        "visual",
-    )
+    Vison_blocks_tuple = ("vision", "visual",)
     for n, m in model.named_modules():
         if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
             if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
@@ -545,19 +543,19 @@ def get_multimodal_block_names(model, quant_vision=False):
 
 
 def detect_device(device=None):
-    """Detects the device to use for model execution (GPU, HPU, or CPU).
+    """
+    Detects the device to use for model execution (GPU, HPU, or CPU).
 
     Args:
-        device (str, int, torch.device, optional):
+        device (str, int, torch.device, optional): 
             - If a string ('cuda', 'cpu', or 'hpu') or torch.device is provided, that device is selected.
             - If an integer is provided, it treats it as a GPU device index.
-            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available,
+            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available, 
               or falls back to 'cpu'.
 
     Returns:
         str: The selected device in string format ('cuda:X', 'hpu', or 'cpu').
     """
-
     def is_valid_digit(s):
         try:
             num = int(s)
@@ -589,7 +587,8 @@ def is_valid_digit(s):
 
 
 def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
-    """Runs a model on a provided dataset with automatic device detection for vector-language models.
+    """
+    Runs a model on a provided dataset with automatic device detection for vector-language models.
 
     Args:
         model: The model to run.
@@ -607,18 +606,18 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
             input_ids = org_data.to(device)
             data = input_ids
         elif isinstance(org_data, tuple) or isinstance(org_data, list):
-            data = org_data
-            input_ids = data[0]
+                data = org_data
+                input_ids = data[0]
         else:
             data = {}
             for key in org_data.keys():
                 data[key] = to_device(org_data[key], device)
-                if key == "images":
+                if key == 'images':
                     data[key] = to_dtype(org_data[key], model.orig_model.dtype)
             input_ids = data["input_ids"]
         if input_ids.shape[-1] < seqlen:
             continue
-
+        
         if isinstance(data, tuple) or isinstance(data, list):
             model(*data)
         elif isinstance(data, dict):
@@ -628,3 +627,4 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
         total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
         if total_cnt >= nsamples:
             break
+

From 533afd076fbe4ba69277758437863e503091b2c6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 26 Sep 2024 07:57:13 +0000
Subject: [PATCH 10/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/torch/utils/utility.py | 55 ++++++++++++------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index 7c89fec8652..dbd235cbd58 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -15,10 +15,13 @@
 
 
 import enum
+import importlib
+from collections import UserDict
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import psutil
 import torch
+import transformers
 from typing_extensions import TypeAlias
 
 from neural_compressor.common.utils import (
@@ -29,9 +32,6 @@
     detect_processor_type_based_on_hw,
     logger,
 )
-import transformers
-from collections import UserDict
-import importlib
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -49,8 +49,7 @@
 
 
 def is_optimum_habana_available():
-    """
-    Checks if the Optimum Habana module is available for use with the transformers library.
+    """Checks if the Optimum Habana module is available for use with the transformers library.
 
     This function checks two conditions:
     1. If the `optimum` package is available using `transformers.utils.import_utils.is_optimum_available`.
@@ -374,8 +373,9 @@ def get_module(module, key):
     return module
 
 
-def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
-                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
+def get_layer_names_in_block(
+    model, supported_types=[torch.nn.Linear, transformers.modeling_utils.Conv1D], quant_block_list=None
+):
     """Retrieves the names of layers within each block of the model.
 
     Returns:
@@ -488,8 +488,7 @@ def get_block_names(model):
 
 
 def validate_modules(module_names):
-    """
-    Test a list of modules' validity.
+    """Test a list of modules' validity.
 
     Args:
     modules (list of str): List of strings to be validated.
@@ -498,10 +497,10 @@ def validate_modules(module_names):
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
     if not bool(module_names):  # pragma: no cover
-        raise ValueError(f"Empty modules")
+        raise ValueError("Empty modules")
     if len(module_names) < 2:
         return True
-    split_modules = [s.split('.') for s, _ in module_names]
+    split_modules = [s.split(".") for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
     if len(set(lengths)) == 1:  # pragma: no cover
         return True
@@ -509,12 +508,13 @@ def validate_modules(module_names):
     min_length = min(lengths)
     longest_module = next(s for s in split_modules if len(s) == max_length)
     shortest_module = next(s for s in split_modules if len(s) == min_length)
-    shortest_module = '.'.join(shortest_module)
-    longest_module = '.'.join(longest_module)
+    shortest_module = ".".join(shortest_module)
+    longest_module = ".".join(longest_module)
     # Check if the shortest name is a substring of the longest name
     if shortest_module in longest_module:  # pragma: no cover
-        raise ValueError(f"Invalid modules, at least two modules detected" \
-                         " as dependent, {shortest_module} and {longest_module}")
+        raise ValueError(
+            "Invalid modules, at least two modules detected" " as dependent, {shortest_module} and {longest_module}"
+        )
     return True
 
 
@@ -529,7 +529,10 @@ def get_multimodal_block_names(model, quant_vision=False):
     """
     block_names = []
     target_modules = []
-    Vison_blocks_tuple = ("vision", "visual",)
+    Vison_blocks_tuple = (
+        "vision",
+        "visual",
+    )
     for n, m in model.named_modules():
         if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
             if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
@@ -543,19 +546,19 @@ def get_multimodal_block_names(model, quant_vision=False):
 
 
 def detect_device(device=None):
-    """
-    Detects the device to use for model execution (GPU, HPU, or CPU).
+    """Detects the device to use for model execution (GPU, HPU, or CPU).
 
     Args:
-        device (str, int, torch.device, optional): 
+        device (str, int, torch.device, optional):
             - If a string ('cuda', 'cpu', or 'hpu') or torch.device is provided, that device is selected.
             - If an integer is provided, it treats it as a GPU device index.
-            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available, 
+            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available,
               or falls back to 'cpu'.
 
     Returns:
         str: The selected device in string format ('cuda:X', 'hpu', or 'cpu').
     """
+
     def is_valid_digit(s):
         try:
             num = int(s)
@@ -587,8 +590,7 @@ def is_valid_digit(s):
 
 
 def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
-    """
-    Runs a model on a provided dataset with automatic device detection for vector-language models.
+    """Runs a model on a provided dataset with automatic device detection for vector-language models.
 
     Args:
         model: The model to run.
@@ -606,18 +608,18 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
             input_ids = org_data.to(device)
             data = input_ids
         elif isinstance(org_data, tuple) or isinstance(org_data, list):
-                data = org_data
-                input_ids = data[0]
+            data = org_data
+            input_ids = data[0]
         else:
             data = {}
             for key in org_data.keys():
                 data[key] = to_device(org_data[key], device)
-                if key == 'images':
+                if key == "images":
                     data[key] = to_dtype(org_data[key], model.orig_model.dtype)
             input_ids = data["input_ids"]
         if input_ids.shape[-1] < seqlen:
             continue
-        
+
         if isinstance(data, tuple) or isinstance(data, list):
             model(*data)
         elif isinstance(data, dict):
@@ -627,4 +629,3 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
         total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
         if total_cnt >= nsamples:
             break
-

From 21e1dbb89b3248f31f8beab448a27ebafc117c4b Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 26 Sep 2024 16:10:07 +0800
Subject: [PATCH 11/33] refine shell

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Phi-3-vision/run_autoround.sh      | 1 -
 .../quantization/auto_round/Qwen-VL/run_autoround.sh           | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
index d01f166ff7f..25e0bc81ccc 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
@@ -6,7 +6,6 @@ CUDA_VISIBLE_DEVICES=$device \
 python3 main.py \
 --model_name=$model_name \
 --nsamples 512 \
---model_dtype fp32 \
 --image_folder /PATH/TO/coco/images/train2017 \
 --question_file /PATH/TO/llava_v1_5_mix665k.json \
 --output_dir "./tmp_autoround"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
index 9269fbec37e..82fc18bcb2d 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
@@ -12,10 +12,7 @@ python3 main.py \
 --iters 200 \
 --seqlen 512 \
 --disable_quanted_input \
---model_dtype fp32 \
 --image_folder /path/to/coco/images/train2017/ \
 --question_file /path/to/Qwen-VL_mix665k.json \
 --output_dir "./tmp_autoround"
 
-
-

From 64d1b3e06e01460639e711563d2b2000e016cc45 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Sun, 29 Sep 2024 14:49:11 +0800
Subject: [PATCH 12/33] refine scripts & requirements

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 examples/.config/model_params_pytorch_3x.json | 359 +++++++++---------
 .../auto_round/Llava/requirements.txt         |  14 +
 .../auto_round/Llava/run_autoround.sh         |   1 +
 .../auto_round/Phi-3-vision/run_autoround.sh  |   1 +
 .../auto_round/{ => Qwen-VL}/requirements.txt |   0
 .../auto_round/Qwen-VL/run_autoround.sh       |   1 +
 6 files changed, 207 insertions(+), 169 deletions(-)
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/requirements.txt
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{ => Qwen-VL}/requirements.txt (100%)

diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index c3ae3f6b5be..795861e2d36 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -1,172 +1,193 @@
 {
-    "pytorch": {
-      "opt_125m_woq_gptq_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_woq_gptq_int4_dq_bnb":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_woq_gptq_int4_dq_ggml":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "llama2_7b_gptq_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "llama2_7b_gptq_int4_dq_bnb":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "llama2_7b_gptq_int4_dq_ggml":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_rtn_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_rtn_int4_dq_bnb":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_rtn_int4_dq_ggml":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_gptq_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_gptq_int4_dq_bnb":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_gptq_int4_dq_ggml":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "gpt_j_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "llama2_7b_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "llama2_7b_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "opt_125m_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "dlrm_ipex": {
-        "model_src_dir": "recommendation/dlrm/static_quant/ipex",
-        "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
-        "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
-        "main_script": "dlrm_s_pytorch.py",
-        "batch_size": 16384
-      },
-      "resnet18_pt2e_static":{
-        "model_src_dir": "cv/static_quant",
-        "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
-        "input_model": "",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "resnet18_fp8_static":{
-        "model_src_dir": "cv/fp8_quant",
-        "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
-        "input_model": "",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "opt_125m_pt2e_static":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "sdxl_ipex_sq":{
-        "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "resnet18_mixed_precision": {
-        "model_src_dir": "cv/mixed_precision",
-        "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
-        "input_model": "resnet18",
-        "main_script": "main.py",
-        "batch_size": 20
-      }
+  "pytorch": {
+    "llava_woq_autoround_int4":{
+      "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017",
+      "input_model": "liuhaotian/llava-v1.5-7b",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "qwenvl_woq_autoround_int4":{
+      "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017",
+      "input_model": "Qwen/Qwen-VL",
+      "main_script": "main.py",
+      "batch_size": 8
+    },
+    "Phi3Vision_woq_autoround_int4":{
+      "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi-3-vision",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017",
+      "input_model": "microsoft/Phi-3-vision-128k-instruct",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_gptq_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_gptq_int4_dq_bnb":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_gptq_int4_dq_ggml":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "llama2_7b_gptq_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "llama2_7b_gptq_int4_dq_bnb":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "llama2_7b_gptq_int4_dq_ggml":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_rtn_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_rtn_int4_dq_bnb":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_rtn_int4_dq_ggml":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_gptq_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_gptq_int4_dq_bnb":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_gptq_int4_dq_ggml":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_ipex":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "gpt_j_ipex_sq":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "llama2_7b_ipex":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "llama2_7b_ipex_sq":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_ipex":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "opt_125m_ipex_sq":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "dlrm_ipex": {
+      "model_src_dir": "recommendation/dlrm/static_quant/ipex",
+      "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
+      "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
+      "main_script": "dlrm_s_pytorch.py",
+      "batch_size": 16384
+    },
+    "resnet18_pt2e_static":{
+      "model_src_dir": "cv/static_quant",
+      "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+      "input_model": "",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "resnet18_fp8_static":{
+      "model_src_dir": "cv/fp8_quant",
+      "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+      "input_model": "",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "opt_125m_pt2e_static":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "sdxl_ipex_sq":{
+      "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "resnet18_mixed_precision": {
+      "model_src_dir": "cv/mixed_precision",
+      "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+      "input_model": "resnet18",
+      "main_script": "main.py",
+      "batch_size": 20
     }
+  }
 }
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/requirements.txt
new file mode 100644
index 00000000000..0a3d5a0a420
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/requirements.txt
@@ -0,0 +1,14 @@
+transformers
+torch
+tiktoken
+transformers_stream_generator
+peft
+sentencepiece
+einops
+accelerate
+datasets
+protobuf
+auto-gptq
+openpyxl
+wandb
+py-cpuinfo
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
index 44750141fb4..19b07526e26 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
@@ -9,6 +9,7 @@ python3 main.py \
 --group_size 128 \
 --iters 200 \
 --seqlen 512 \
+--quantize \
 --image_folder /path/to/coco/images/train2017/ \
 --question_file /path/to/LLaVA-Instruct-150K/llava_v1_5_mix665k.json \
 --eval-question-file /path/to/textvqa/llava_textvqa_val_v051_ocr.jsonl \
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
index 25e0bc81ccc..f7feeec9efd 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
@@ -6,6 +6,7 @@ CUDA_VISIBLE_DEVICES=$device \
 python3 main.py \
 --model_name=$model_name \
 --nsamples 512 \
+--quantize \
 --image_folder /PATH/TO/coco/images/train2017 \
 --question_file /PATH/TO/llava_v1_5_mix665k.json \
 --output_dir "./tmp_autoround"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/requirements.txt
similarity index 100%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/requirements.txt
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/requirements.txt
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
index 82fc18bcb2d..47a282501ce 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
@@ -12,6 +12,7 @@ python3 main.py \
 --iters 200 \
 --seqlen 512 \
 --disable_quanted_input \
+--quantize \
 --image_folder /path/to/coco/images/train2017/ \
 --question_file /path/to/Qwen-VL_mix665k.json \
 --output_dir "./tmp_autoround"

From dc368b2fc3752525b8dc220a566aa85a7b51ff44 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Sun, 29 Sep 2024 14:50:41 +0800
Subject: [PATCH 13/33] typofix

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Phi-3-vision/requirements.txt     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt
index 1322923bd00..6bba92c0b02 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/requirements.txt
@@ -14,5 +14,5 @@ wandb
 py-cpuinfo
 Pillow
 torchvision
-lm-eval==0.4.4
-setuptools==70.0.0
\ No newline at end of file
+# lm-eval==0.4.4
+setuptools==70.0.0

From 22082dffe41e667251a309f4a04dc82694d32cd7 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Mon, 30 Sep 2024 10:09:52 +0800
Subject: [PATCH 14/33] refine docs

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Llava/README.md           |  7 +++++--
 .../quantization/auto_round/Phi-3-vision/README.md    | 11 +++--------
 .../quantization/auto_round/Qwen-VL/README.md         |  4 +++-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
index 367c3b96700..0efe683a67c 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
@@ -24,7 +24,7 @@ pip install --upgrade pip  # enable PEP 660 support
 pip install -e .
 ```
 
-## Download the calibration data
+## Download the calibration/evaluation data
 
 Our calibration process resembles the official visual instruction tuning process. To align the official implementation of [LLaVA](https://github.com/haotian-liu/LLaVA/tree/main?tab=readme-ov-file#visual-instruction-tuning)
 
@@ -32,6 +32,8 @@ Please download the annotation of the final mixture our instruction tuning data
 
 COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip the image folder to any directory you desire.
 
+Please refer to [llava_eval_datasets](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md#scripts) to download the textVQA dataset for evaluation.
+
 <br />
 
 ## 2. Run Examples
@@ -43,7 +45,7 @@ pip install -r requirements.txt
 
 - **Default Settings:**
 ```bash
-CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name liuhaotian/llava-v1.5-7b  --bits 4 --group_size 128
+CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name liuhaotian/llava-v1.5-7b  --bits 4 --group_size 128 --quantize
 ```
 
 ## 3. Results
@@ -77,3 +79,4 @@ If you find SignRound useful for your research, please cite our paper:
 ```
 
 
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
index 6ca61c1ce01..8a7a109cedc 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
@@ -25,7 +25,7 @@ pip install -r requirements.txt
 
 - **Default Settings:**
 ```bash
-CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name microsoft/Phi-3-vision-128k-instruct  --bits 4 --group_size 128
+CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name microsoft/Phi-3-vision-128k-instruct  --bits 4 --group_size 128  --quantize
 ```
 
 
@@ -94,7 +94,8 @@ Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://hu
 | ceval          | 0.4027 | 0.4012 |
 | gsm8k          | 0.7157 | 0.6755 | -->
 
-
+## 4. Known Issues
+* The Flashattention2 component that Phi3-Vision relies on is **not supported on cpu devices**.
 
 ## Reference
 If you find SignRound useful for your research, please cite our paper:
@@ -107,9 +108,3 @@ If you find SignRound useful for your research, please cite our paper:
 }
 ```
 
-
-
-
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md
index 9c9f729e3ee..c32dc1cfade 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md
@@ -78,7 +78,7 @@ pip install -r requirements.txt
 
 - **Default Settings:**
 ```bash
-CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name Qwen/Qwen-VL  --bits 4 --group_size 128
+CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name Qwen/Qwen-VL  --bits 4 --group_size 128  --quantize
 ```
 
 
@@ -141,6 +141,7 @@ Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://hu
 | textVQA        | 0.6402 | 0.6379 |
 | scienceVQA     | 0.6748 | 0.6574 |
 
+
 ## 5. Known Issues
 * 'QWenTokenizer' object has no attribute 'IMAGE_ST'
 
@@ -176,3 +177,4 @@ If you find SignRound useful for your research, please cite our paper:
 
 
 
+

From a3b381d0450999f48a22206b2404a75aea52adad Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Tue, 8 Oct 2024 11:31:01 +0800
Subject: [PATCH 15/33] set attn_implementation for Phi3-vision

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Phi-3-vision/main.py     | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
index 432afc35d9e..c7db5b17498 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
@@ -318,17 +318,17 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
             if args.model_dtype == "bfloat16" or args.model_dtype == "bfp16":
                 torch_dtype = torch.bfloat16
                 
-        config = AutoConfig.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-        )
-        config.use_cache = False
+        # config = AutoConfig.from_pretrained(
+        #     model_name,
+        #     trust_remote_code=not args.disable_trust_remote_code,
+        # )
+        # config.use_cache = False
         
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
             torch_dtype=torch_dtype,
             trust_remote_code=not args.disable_trust_remote_code,
-            config=config,
+            _attn_implementation='eager' # _attn_implementation='flash_attention_2' to enable flash attention
         )
         seqlen = args.seqlen
         processor = Phi3VProcessor.from_pretrained(model_name)

From b827c11fbac97a6a1c3786b15449aeff3b641a4f Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Tue, 8 Oct 2024 14:48:05 +0800
Subject: [PATCH 16/33] refine phi3 example

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../auto_round/Phi-3-vision/README.md         |  6 +-
 neural_compressor/torch/utils/utility.py      | 68 +++++++++----------
 2 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
index 8a7a109cedc..f618812d11d 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
@@ -1,4 +1,4 @@
-Step-by-Step
+Step-by-Step
 ============
 This document describes the step-by-step instructions to run [VLM quantization for Phi3-Vision](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) using AutoRound Quantization.
 
@@ -94,8 +94,8 @@ Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://hu
 | ceval          | 0.4027 | 0.4012 |
 | gsm8k          | 0.7157 | 0.6755 | -->
 
-## 4. Known Issues
-* The Flashattention2 component that Phi3-Vision relies on is **not supported on cpu devices**.
+<!-- ## 4. Known Issues
+* The Flashattention2 component that Phi3-Vision relies on is **not supported on cpu devices**. -->
 
 ## Reference
 If you find SignRound useful for your research, please cite our paper:
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index dbd235cbd58..809e1859c89 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -15,13 +15,10 @@
 
 
 import enum
-import importlib
-from collections import UserDict
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import psutil
 import torch
-import transformers
 from typing_extensions import TypeAlias
 
 from neural_compressor.common.utils import (
@@ -32,6 +29,9 @@
     detect_processor_type_based_on_hw,
     logger,
 )
+import transformers
+from collections import UserDict
+import importlib
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -49,7 +49,8 @@
 
 
 def is_optimum_habana_available():
-    """Checks if the Optimum Habana module is available for use with the transformers library.
+    """
+    Checks if the Optimum Habana module is available for use with the transformers library.
 
     This function checks two conditions:
     1. If the `optimum` package is available using `transformers.utils.import_utils.is_optimum_available`.
@@ -59,7 +60,7 @@ def is_optimum_habana_available():
         bool: True if Optimum Habana is available, False otherwise.
     """
     from transformers.utils.import_utils import is_optimum_available
-
+    
     return is_optimum_available() and importlib.util.find_spec("optimum.habana") is not None
 
 
@@ -373,9 +374,8 @@ def get_module(module, key):
     return module
 
 
-def get_layer_names_in_block(
-    model, supported_types=[torch.nn.Linear, transformers.modeling_utils.Conv1D], quant_block_list=None
-):
+def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
+                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
     """Retrieves the names of layers within each block of the model.
 
     Returns:
@@ -402,7 +402,7 @@ def get_layer_names_in_block(
     return layers_in_block
 
 
-def to_dtype(input, dtype=torch.float32):
+def to_dtype(input, dtype=torch.float32): # pragma: no cover
     """Moves input data to the specified data type.
 
     Args:
@@ -444,14 +444,13 @@ def to_device(input, device=torch.device("cpu")):
     Returns:
     The input data on the specified device.
     """
-    if input is None:
+    if input is None: # pragma: no cover
         return None
     if isinstance(input, torch.Tensor):
         return input.to(device)
-    if isinstance(input, dict) or isinstance(input, UserDict):
+    if isinstance(input, dict) or isinstance(input, UserDict): # pragma: no cover
         for inp in input.keys():
             input[inp] = to_device(input[inp], device)
-
     elif isinstance(input, list) or isinstance(input, tuple):
         if len(input) == 0:
             return input
@@ -488,7 +487,8 @@ def get_block_names(model):
 
 
 def validate_modules(module_names):
-    """Test a list of modules' validity.
+    """
+    Test a list of modules' validity.
 
     Args:
     modules (list of str): List of strings to be validated.
@@ -497,10 +497,10 @@ def validate_modules(module_names):
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
     if not bool(module_names):  # pragma: no cover
-        raise ValueError("Empty modules")
+        raise ValueError(f"Empty modules")
     if len(module_names) < 2:
         return True
-    split_modules = [s.split(".") for s, _ in module_names]
+    split_modules = [s.split('.') for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
     if len(set(lengths)) == 1:  # pragma: no cover
         return True
@@ -508,13 +508,12 @@ def validate_modules(module_names):
     min_length = min(lengths)
     longest_module = next(s for s in split_modules if len(s) == max_length)
     shortest_module = next(s for s in split_modules if len(s) == min_length)
-    shortest_module = ".".join(shortest_module)
-    longest_module = ".".join(longest_module)
+    shortest_module = '.'.join(shortest_module)
+    longest_module = '.'.join(longest_module)
     # Check if the shortest name is a substring of the longest name
     if shortest_module in longest_module:  # pragma: no cover
-        raise ValueError(
-            "Invalid modules, at least two modules detected" " as dependent, {shortest_module} and {longest_module}"
-        )
+        raise ValueError(f"Invalid modules, at least two modules detected" \
+                         " as dependent, {shortest_module} and {longest_module}")
     return True
 
 
@@ -529,10 +528,7 @@ def get_multimodal_block_names(model, quant_vision=False):
     """
     block_names = []
     target_modules = []
-    Vison_blocks_tuple = (
-        "vision",
-        "visual",
-    )
+    Vison_blocks_tuple = ("vision", "visual",)
     for n, m in model.named_modules():
         if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
             if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
@@ -545,20 +541,20 @@ def get_multimodal_block_names(model, quant_vision=False):
     return block_names
 
 
-def detect_device(device=None):
-    """Detects the device to use for model execution (GPU, HPU, or CPU).
+def detect_device(device=None): # pragma: no cover
+    """
+    Detects the device to use for model execution (GPU, HPU, or CPU).
 
     Args:
-        device (str, int, torch.device, optional):
+        device (str, int, torch.device, optional): 
             - If a string ('cuda', 'cpu', or 'hpu') or torch.device is provided, that device is selected.
             - If an integer is provided, it treats it as a GPU device index.
-            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available,
+            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available, 
               or falls back to 'cpu'.
 
     Returns:
         str: The selected device in string format ('cuda:X', 'hpu', or 'cpu').
     """
-
     def is_valid_digit(s):
         try:
             num = int(s)
@@ -574,7 +570,7 @@ def is_valid_digit(s):
         if torch.cuda.is_available():
             device = torch.device("cuda")
             print("Using GPU device")
-        elif is_optimum_habana_available():  # pragma: no cover
+        elif is_optimum_habana_available():
             device = torch.device("hpu")
             print("Using HPU device")
         # Use CPU as a fallback
@@ -590,7 +586,8 @@ def is_valid_digit(s):
 
 
 def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
-    """Runs a model on a provided dataset with automatic device detection for vector-language models.
+    """
+    Runs a model on a provided dataset with automatic device detection for vector-language models.
 
     Args:
         model: The model to run.
@@ -608,18 +605,18 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
             input_ids = org_data.to(device)
             data = input_ids
         elif isinstance(org_data, tuple) or isinstance(org_data, list):
-            data = org_data
-            input_ids = data[0]
+                data = org_data
+                input_ids = data[0]
         else:
             data = {}
             for key in org_data.keys():
                 data[key] = to_device(org_data[key], device)
-                if key == "images":
+                if key == 'images':
                     data[key] = to_dtype(org_data[key], model.orig_model.dtype)
             input_ids = data["input_ids"]
         if input_ids.shape[-1] < seqlen:
             continue
-
+        
         if isinstance(data, tuple) or isinstance(data, list):
             model(*data)
         elif isinstance(data, dict):
@@ -629,3 +626,4 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
         total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
         if total_cnt >= nsamples:
             break
+

From 8767ffc4c2d02472dd5b79298523e64dffb354a4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 8 Oct 2024 06:50:46 +0000
Subject: [PATCH 17/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/torch/utils/utility.py | 65 ++++++++++++------------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index a43ae5f60a0..7ffa7e4beb0 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -15,10 +15,13 @@
 
 
 import enum
+import importlib
+from collections import UserDict
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import psutil
 import torch
+import transformers
 from typing_extensions import TypeAlias
 
 from neural_compressor.common.utils import (
@@ -29,9 +32,6 @@
     detect_processor_type_based_on_hw,
     logger,
 )
-import transformers
-from collections import UserDict
-import importlib
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -49,8 +49,7 @@
 
 
 def is_optimum_habana_available():
-    """
-    Checks if the Optimum Habana module is available for use with the transformers library.
+    """Checks if the Optimum Habana module is available for use with the transformers library.
 
     This function checks two conditions:
     1. If the `optimum` package is available using `transformers.utils.import_utils.is_optimum_available`.
@@ -60,7 +59,7 @@ def is_optimum_habana_available():
         bool: True if Optimum Habana is available, False otherwise.
     """
     from transformers.utils.import_utils import is_optimum_available
-    
+
     return is_optimum_available() and importlib.util.find_spec("optimum.habana") is not None
 
 
@@ -374,8 +373,9 @@ def get_module(module, key):
     return module
 
 
-def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
-                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
+def get_layer_names_in_block(
+    model, supported_types=[torch.nn.Linear, transformers.modeling_utils.Conv1D], quant_block_list=None
+):
     """Retrieves the names of layers within each block of the model.
 
     Returns:
@@ -402,7 +402,7 @@ def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
     return layers_in_block
 
 
-def to_dtype(input, dtype=torch.float32): # pragma: no cover
+def to_dtype(input, dtype=torch.float32):  # pragma: no cover
     """Moves input data to the specified data type.
 
     Args:
@@ -444,11 +444,11 @@ def to_device(input, device=torch.device("cpu")):
     Returns:
     The input data on the specified device.
     """
-    if input is None: # pragma: no cover
+    if input is None:  # pragma: no cover
         return None
     if isinstance(input, torch.Tensor):
         return input.to(device)
-    if isinstance(input, dict) or isinstance(input, UserDict): # pragma: no cover
+    if isinstance(input, dict) or isinstance(input, UserDict):  # pragma: no cover
         for inp in input.keys():
             input[inp] = to_device(input[inp], device)
     elif isinstance(input, list) or isinstance(input, tuple):
@@ -487,8 +487,7 @@ def get_block_names(model):
 
 
 def validate_modules(module_names):
-    """
-    Test a list of modules' validity.
+    """Test a list of modules' validity.
 
     Args:
     modules (list of str): List of strings to be validated.
@@ -497,10 +496,10 @@ def validate_modules(module_names):
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
     if not bool(module_names):  # pragma: no cover
-        raise ValueError(f"Empty modules")
+        raise ValueError("Empty modules")
     if len(module_names) < 2:
         return True
-    split_modules = [s.split('.') for s, _ in module_names]
+    split_modules = [s.split(".") for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
     if len(set(lengths)) == 1:  # pragma: no cover
         return True
@@ -508,12 +507,13 @@ def validate_modules(module_names):
     min_length = min(lengths)
     longest_module = next(s for s in split_modules if len(s) == max_length)
     shortest_module = next(s for s in split_modules if len(s) == min_length)
-    shortest_module = '.'.join(shortest_module)
-    longest_module = '.'.join(longest_module)
+    shortest_module = ".".join(shortest_module)
+    longest_module = ".".join(longest_module)
     # Check if the shortest name is a substring of the longest name
     if shortest_module in longest_module:  # pragma: no cover
-        raise ValueError(f"Invalid modules, at least two modules detected" \
-                         " as dependent, {shortest_module} and {longest_module}")
+        raise ValueError(
+            "Invalid modules, at least two modules detected" " as dependent, {shortest_module} and {longest_module}"
+        )
     return True
 
 
@@ -528,7 +528,10 @@ def get_multimodal_block_names(model, quant_vision=False):
     """
     block_names = []
     target_modules = []
-    Vison_blocks_tuple = ("vision", "visual",)
+    Vison_blocks_tuple = (
+        "vision",
+        "visual",
+    )
     for n, m in model.named_modules():
         if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
             if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
@@ -541,20 +544,20 @@ def get_multimodal_block_names(model, quant_vision=False):
     return block_names
 
 
-def detect_device(device=None): # pragma: no cover
-    """
-    Detects the device to use for model execution (GPU, HPU, or CPU).
+def detect_device(device=None):  # pragma: no cover
+    """Detects the device to use for model execution (GPU, HPU, or CPU).
 
     Args:
-        device (str, int, torch.device, optional): 
+        device (str, int, torch.device, optional):
             - If a string ('cuda', 'cpu', or 'hpu') or torch.device is provided, that device is selected.
             - If an integer is provided, it treats it as a GPU device index.
-            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available, 
+            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available,
               or falls back to 'cpu'.
 
     Returns:
         str: The selected device in string format ('cuda:X', 'hpu', or 'cpu').
     """
+
     def is_valid_digit(s):
         try:
             num = int(s)
@@ -586,8 +589,7 @@ def is_valid_digit(s):
 
 
 def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
-    """
-    Runs a model on a provided dataset with automatic device detection for vector-language models.
+    """Runs a model on a provided dataset with automatic device detection for vector-language models.
 
     Args:
         model: The model to run.
@@ -605,18 +607,18 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
             input_ids = org_data.to(device)
             data = input_ids
         elif isinstance(org_data, tuple) or isinstance(org_data, list):
-                data = org_data
-                input_ids = data[0]
+            data = org_data
+            input_ids = data[0]
         else:
             data = {}
             for key in org_data.keys():
                 data[key] = to_device(org_data[key], device)
-                if key == 'images':
+                if key == "images":
                     data[key] = to_dtype(org_data[key], model.orig_model.dtype)
             input_ids = data["input_ids"]
         if input_ids.shape[-1] < seqlen:
             continue
-        
+
         if isinstance(data, tuple) or isinstance(data, list):
             model(*data)
         elif isinstance(data, dict):
@@ -626,4 +628,3 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
         total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
         if total_cnt >= nsamples:
             break
-

From afbfcfa0787ba5e83b799d1ce13e16a3389a8f0f Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Tue, 8 Oct 2024 17:10:51 +0800
Subject: [PATCH 18/33] fix code coverage

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 neural_compressor/torch/utils/utility.py      | 16 +++++++--------
 .../weight_only/test_autoround.py             | 20 +++++++++----------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index a43ae5f60a0..832e6fb2ed1 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -434,7 +434,7 @@ def to_dtype(input, dtype=torch.float32): # pragma: no cover
 
 
 # for VLM usage
-def to_device(input, device=torch.device("cpu")):
+def to_device(input, device=torch.device("cpu")):  # pragma: no cover
     """Moves input data to the specified device.
 
     Args:
@@ -444,11 +444,11 @@ def to_device(input, device=torch.device("cpu")):
     Returns:
     The input data on the specified device.
     """
-    if input is None: # pragma: no cover
+    if input is None:
         return None
     if isinstance(input, torch.Tensor):
         return input.to(device)
-    if isinstance(input, dict) or isinstance(input, UserDict): # pragma: no cover
+    if isinstance(input, dict) or isinstance(input, UserDict):
         for inp in input.keys():
             input[inp] = to_device(input[inp], device)
     elif isinstance(input, list) or isinstance(input, tuple):
@@ -486,7 +486,7 @@ def get_block_names(model):
     return block_names
 
 
-def validate_modules(module_names):
+def validate_modules(module_names): # pragma: no cover
     """
     Test a list of modules' validity.
 
@@ -496,13 +496,13 @@ def validate_modules(module_names):
     Returns:
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
-    if not bool(module_names):  # pragma: no cover
+    if not bool(module_names):  
         raise ValueError(f"Empty modules")
     if len(module_names) < 2:
         return True
     split_modules = [s.split('.') for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
-    if len(set(lengths)) == 1:  # pragma: no cover
+    if len(set(lengths)) == 1:
         return True
     max_length = max(lengths)
     min_length = min(lengths)
@@ -511,7 +511,7 @@ def validate_modules(module_names):
     shortest_module = '.'.join(shortest_module)
     longest_module = '.'.join(longest_module)
     # Check if the shortest name is a substring of the longest name
-    if shortest_module in longest_module:  # pragma: no cover
+    if shortest_module in longest_module:
         raise ValueError(f"Invalid modules, at least two modules detected" \
                          " as dependent, {shortest_module} and {longest_module}")
     return True
@@ -585,7 +585,7 @@ def is_valid_digit(s):
     return device
 
 
-def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):
+def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):  # pragma: no cover
     """
     Runs a model on a provided dataset with automatic device detection for vector-language models.
 
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 79245345cbb..d06f29c7f8a 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -163,21 +163,17 @@ def test_conv1d(self):
         assert isinstance(q_model.h[0].attn.c_attn, WeightOnlyLinear), "loading compressed model failed."
 
     def test_utils(self):
-        from neural_compressor.torch.utils.utility import (
-            detect_device,
-            get_layer_names_in_block,
-            get_multimodal_block_names,
-            run_fn_for_vlm_autoround,
-        )
-
+        from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
+                                                            get_layer_names_in_block,
+                                                            detect_device,
+                                                            run_fn_for_vlm_autoround)
         fp32_model = copy.deepcopy(self.gptj)
         quant_block_list = get_multimodal_block_names(fp32_model, quant_vision=True)
-        quant_config = AutoRoundConfig(
-            nsamples=32, seqlen=10, iters=10, scale_dtype="fp16", quant_block_list=quant_block_list
-        )
+        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp16", quant_block_list=quant_block_list)
         logger.info(f"Test AutoRound with config {quant_config}")
         device = detect_device("auto")
         layers_list = get_layer_names_in_block(fp32_model, quant_block_list=quant_block_list)
+        layers_list = get_layer_names_in_block(fp32_model)
         fp32_model.to(device)
         # quantizer execute
         model = prepare(model=fp32_model, quant_config=quant_config)
@@ -186,7 +182,8 @@ def test_utils(self):
         out = q_model(self.inp)[0]
         assert torch.allclose(out, self.label, atol=1e-1)
         assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."
-
+        
+        
     # def test_autoround_format_export(self):
     #     from neural_compressor.torch.quantization import load
     #     from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear
@@ -201,3 +198,4 @@ def test_utils(self):
     #     assert isinstance(q_model.transformer.h[0].attn.k_proj, QuantLinear), "packing model failed."
     #     q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
     #     loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)
+

From 6b8cc73feab7d7ac4bc834bfd65193bc28ca7c9d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 8 Oct 2024 09:16:45 +0000
Subject: [PATCH 19/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/torch/utils/utility.py      | 65 ++++++++++---------
 .../weight_only/test_autoround.py             | 19 +++---
 2 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index 832e6fb2ed1..2964983440d 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -15,10 +15,13 @@
 
 
 import enum
+import importlib
+from collections import UserDict
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import psutil
 import torch
+import transformers
 from typing_extensions import TypeAlias
 
 from neural_compressor.common.utils import (
@@ -29,9 +32,6 @@
     detect_processor_type_based_on_hw,
     logger,
 )
-import transformers
-from collections import UserDict
-import importlib
 
 OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]]
 
@@ -49,8 +49,7 @@
 
 
 def is_optimum_habana_available():
-    """
-    Checks if the Optimum Habana module is available for use with the transformers library.
+    """Checks if the Optimum Habana module is available for use with the transformers library.
 
     This function checks two conditions:
     1. If the `optimum` package is available using `transformers.utils.import_utils.is_optimum_available`.
@@ -60,7 +59,7 @@ def is_optimum_habana_available():
         bool: True if Optimum Habana is available, False otherwise.
     """
     from transformers.utils.import_utils import is_optimum_available
-    
+
     return is_optimum_available() and importlib.util.find_spec("optimum.habana") is not None
 
 
@@ -374,8 +373,9 @@ def get_module(module, key):
     return module
 
 
-def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
-                                                     transformers.modeling_utils.Conv1D], quant_block_list=None):
+def get_layer_names_in_block(
+    model, supported_types=[torch.nn.Linear, transformers.modeling_utils.Conv1D], quant_block_list=None
+):
     """Retrieves the names of layers within each block of the model.
 
     Returns:
@@ -402,7 +402,7 @@ def get_layer_names_in_block(model, supported_types=[torch.nn.Linear,
     return layers_in_block
 
 
-def to_dtype(input, dtype=torch.float32): # pragma: no cover
+def to_dtype(input, dtype=torch.float32):  # pragma: no cover
     """Moves input data to the specified data type.
 
     Args:
@@ -486,9 +486,8 @@ def get_block_names(model):
     return block_names
 
 
-def validate_modules(module_names): # pragma: no cover
-    """
-    Test a list of modules' validity.
+def validate_modules(module_names):  # pragma: no cover
+    """Test a list of modules' validity.
 
     Args:
     modules (list of str): List of strings to be validated.
@@ -496,11 +495,11 @@ def validate_modules(module_names): # pragma: no cover
     Returns:
     bool: True if all modules have equal length or not dependent, otherwise False.
     """
-    if not bool(module_names):  
-        raise ValueError(f"Empty modules")
+    if not bool(module_names):
+        raise ValueError("Empty modules")
     if len(module_names) < 2:
         return True
-    split_modules = [s.split('.') for s, _ in module_names]
+    split_modules = [s.split(".") for s, _ in module_names]
     lengths = [len(parts) for parts in split_modules]
     if len(set(lengths)) == 1:
         return True
@@ -508,12 +507,13 @@ def validate_modules(module_names): # pragma: no cover
     min_length = min(lengths)
     longest_module = next(s for s in split_modules if len(s) == max_length)
     shortest_module = next(s for s in split_modules if len(s) == min_length)
-    shortest_module = '.'.join(shortest_module)
-    longest_module = '.'.join(longest_module)
+    shortest_module = ".".join(shortest_module)
+    longest_module = ".".join(longest_module)
     # Check if the shortest name is a substring of the longest name
     if shortest_module in longest_module:
-        raise ValueError(f"Invalid modules, at least two modules detected" \
-                         " as dependent, {shortest_module} and {longest_module}")
+        raise ValueError(
+            f"Invalid modules, at least two modules detected as dependent, {shortest_module} and {longest_module}"
+        )
     return True
 
 
@@ -528,7 +528,10 @@ def get_multimodal_block_names(model, quant_vision=False):
     """
     block_names = []
     target_modules = []
-    Vison_blocks_tuple = ("vision", "visual",)
+    Vison_blocks_tuple = (
+        "vision",
+        "visual",
+    )
     for n, m in model.named_modules():
         if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__:
             if quant_vision or all(key not in n.lower() for key in (Vison_blocks_tuple)):
@@ -541,20 +544,20 @@ def get_multimodal_block_names(model, quant_vision=False):
     return block_names
 
 
-def detect_device(device=None): # pragma: no cover
-    """
-    Detects the device to use for model execution (GPU, HPU, or CPU).
+def detect_device(device=None):  # pragma: no cover
+    """Detects the device to use for model execution (GPU, HPU, or CPU).
 
     Args:
-        device (str, int, torch.device, optional): 
+        device (str, int, torch.device, optional):
             - If a string ('cuda', 'cpu', or 'hpu') or torch.device is provided, that device is selected.
             - If an integer is provided, it treats it as a GPU device index.
-            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available, 
+            - If None or 'auto', it automatically selects 'cuda' if available, 'hpu' if Habana is available,
               or falls back to 'cpu'.
 
     Returns:
         str: The selected device in string format ('cuda:X', 'hpu', or 'cpu').
     """
+
     def is_valid_digit(s):
         try:
             num = int(s)
@@ -586,8 +589,7 @@ def is_valid_digit(s):
 
 
 def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):  # pragma: no cover
-    """
-    Runs a model on a provided dataset with automatic device detection for vector-language models.
+    """Runs a model on a provided dataset with automatic device detection for vision-language models.
 
     Args:
         model: The model to run.
@@ -605,18 +607,18 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):  # pr
             input_ids = org_data.to(device)
             data = input_ids
         elif isinstance(org_data, tuple) or isinstance(org_data, list):
-                data = org_data
-                input_ids = data[0]
+            data = org_data
+            input_ids = data[0]
         else:
             data = {}
             for key in org_data.keys():
                 data[key] = to_device(org_data[key], device)
-                if key == 'images':
+                if key == "images":
                     data[key] = to_dtype(org_data[key], model.orig_model.dtype)
             input_ids = data["input_ids"]
         if input_ids.shape[-1] < seqlen:
             continue
-        
+
         if isinstance(data, tuple) or isinstance(data, list):
             model(*data)
         elif isinstance(data, dict):
@@ -626,4 +628,3 @@ def run_fn_for_vlm_autoround(model, dataloader, seqlen=512, nsamples=512):  # pr
         total_cnt += input_ids.shape[0] if len(input_ids.shape) > 1 else 1
         if total_cnt >= nsamples:
             break
-
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index d06f29c7f8a..08a0bdf83e3 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -163,13 +163,18 @@ def test_conv1d(self):
         assert isinstance(q_model.h[0].attn.c_attn, WeightOnlyLinear), "loading compressed model failed."
 
     def test_utils(self):
-        from neural_compressor.torch.utils.utility import (get_multimodal_block_names,
-                                                            get_layer_names_in_block,
-                                                            detect_device,
-                                                            run_fn_for_vlm_autoround)
+        from neural_compressor.torch.utils.utility import (
+            detect_device,
+            get_layer_names_in_block,
+            get_multimodal_block_names,
+            run_fn_for_vlm_autoround,
+        )
+
         fp32_model = copy.deepcopy(self.gptj)
         quant_block_list = get_multimodal_block_names(fp32_model, quant_vision=True)
-        quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp16", quant_block_list=quant_block_list)
+        quant_config = AutoRoundConfig(
+            nsamples=32, seqlen=10, iters=10, scale_dtype="fp16", quant_block_list=quant_block_list
+        )
         logger.info(f"Test AutoRound with config {quant_config}")
         device = detect_device("auto")
         layers_list = get_layer_names_in_block(fp32_model, quant_block_list=quant_block_list)
@@ -182,8 +187,7 @@ def test_utils(self):
         out = q_model(self.inp)[0]
         assert torch.allclose(out, self.label, atol=1e-1)
         assert isinstance(q_model.transformer.h[0].attn.k_proj, WeightOnlyLinear), "packing model failed."
-        
-        
+
     # def test_autoround_format_export(self):
     #     from neural_compressor.torch.quantization import load
     #     from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear
@@ -198,4 +202,3 @@ def test_utils(self):
     #     assert isinstance(q_model.transformer.h[0].attn.k_proj, QuantLinear), "packing model failed."
     #     q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
     #     loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)
-

From 9555321c28b904ae08ebf9efd1030fd108f46764 Mon Sep 17 00:00:00 2001
From: "Sun, Xuehao" <xuehao.sun@intel.com>
Date: Wed, 9 Oct 2024 14:31:56 +0800
Subject: [PATCH 20/33] update config

Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
---
 examples/.config/model_params_pytorch_3x.json | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index 795861e2d36..bb18153f389 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -2,21 +2,24 @@
   "pytorch": {
     "llava_woq_autoround_int4":{
       "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava",
-      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+      "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
       "input_model": "liuhaotian/llava-v1.5-7b",
       "main_script": "main.py",
       "batch_size": 1
     },
     "qwenvl_woq_autoround_int4":{
       "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL",
-      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+      "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
       "input_model": "Qwen/Qwen-VL",
       "main_script": "main.py",
       "batch_size": 8
     },
     "Phi3Vision_woq_autoround_int4":{
       "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi3-3-vision",
-      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+      "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
       "input_model": "microsoft/Phi-3-vision-128k-instruct",
       "main_script": "main.py",
       "batch_size": 1

From 5dcb9bd6eb02a5025c8266f7ac8d64443df98a8d Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Wed, 16 Oct 2024 00:06:42 +0800
Subject: [PATCH 21/33] refine shells, docs and example. enable qwen2-vl
 quantization

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Llava/README.md   |   2 +-
 .../auto_round/Llava/run_autoround.sh         |  65 ++++++---
 .../quantization/auto_round/Llava/run_eval.sh |  51 +++++++
 .../auto_round/Phi-3-vision/README.md         |   9 +-
 .../auto_round/Phi-3-vision/main.py           |   6 +-
 .../auto_round/Phi-3-vision/run_autoround.sh  |  54 ++++++--
 .../auto_round/Qwen-VL/run_autoround.sh       |  19 ---
 .../auto_round/Qwen-VL/run_eval.sh            |  15 --
 .../{Qwen-VL => common_model}/README.md       | 118 +++++++++++-----
 .../{Qwen-VL => common_model}/main.py         | 130 +++++++++++++-----
 .../mm_evaluation/__init__.py                 |   0
 .../mm_evaluation/evaluate_multiple_choice.py |   0
 .../mm_evaluation/evaluate_vqa.py             |   0
 .../mm_evaluation/main.py                     |   0
 .../mm_evaluation/vqa.py                      |   0
 .../mm_evaluation/vqa_eval.py                 |   0
 .../requirements.txt                          |   7 +-
 .../auto_round/common_model/run_autoround.sh  |  52 +++++++
 .../auto_round/common_model/run_eval.sh       |  36 +++++
 .../torch/algorithms/weight_only/save_load.py |  20 ++-
 20 files changed, 442 insertions(+), 142 deletions(-)
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
 delete mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
 delete mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_eval.sh
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/README.md (59%)
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/main.py (84%)
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/mm_evaluation/__init__.py (100%)
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/mm_evaluation/evaluate_multiple_choice.py (100%)
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/mm_evaluation/evaluate_vqa.py (100%)
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/mm_evaluation/main.py (100%)
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/mm_evaluation/vqa.py (100%)
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/mm_evaluation/vqa_eval.py (100%)
 rename examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/{Qwen-VL => common_model}/requirements.txt (64%)
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_autoround.sh
 create mode 100644 examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
index 0efe683a67c..0c28948fa48 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/README.md
@@ -32,7 +32,7 @@ Please download the annotation of the final mixture our instruction tuning data
 
 COCO: [train2017](http://images.cocodataset.org/zips/train2017.zip), and unzip the image folder to any directory you desire.
 
-Please refer to [llava_eval_datasets]https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md#scripts to download the textVQA dataset for evaluation usage
+Please refer to [llava_eval_datasets](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md#scripts) to download the textVQA dataset for evaluation usage
 
 <br />
 
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
index 19b07526e26..991ee772610 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_autoround.sh
@@ -1,21 +1,52 @@
 #!/bin/bash
 set -x
-device=0
-
-CUDA_VISIBLE_DEVICES=$device \
-python3 main.py \
---model_name=liuhaotian/llava-v1.5-7b \
---bits 4 \
---group_size 128 \
---iters 200 \
---seqlen 512 \
---quantize \
---image_folder /path/to/coco/images/train2017/ \
---question_file /path/to/LLaVA-Instruct-150K/llava_v1_5_mix665k.json \
---eval-question-file /path/to/textvqa/llava_textvqa_val_v051_ocr.jsonl \
---eval-image-folder /path/to/textvqa/train_images \
---eval-annotation-file /path/to/textvqa/TextVQA_0.5.1_val.json \
---eval-result-file "./tmp_autoround" \
---output_dir "./tmp_autoround"
 
+function main {
 
+  init_params "$@"
+  run_tuning
+
+}
+
+# Parse --key=value arguments into shell variables; unknown flags abort.
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --model_name=*)
+          model_name="${var#*=}"
+      ;;
+      --image_folder=*)
+          image_folder="${var#*=}"
+      ;;
+      --question_file=*)
+          question_file="${var#*=}"
+      ;;
+      --output_dir=*)
+          output_dir="${var#*=}"
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# Run main.py in quantize mode; expansions are quoted so paths with spaces survive.
+function run_tuning {
+    python main.py \
+            --model_name "${model_name}" \
+            --bits 4 \
+            --group_size 128 \
+            --iters 200 \
+            --seqlen 512 \
+            --quantize \
+            --image_folder "${image_folder}" \
+            --question_file "${question_file}" \
+            --output_dir "${output_dir}"
+}
+
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
new file mode 100644
index 00000000000..cce4c8c5a3b
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_evaluation
+
+}
+
+# Parse --key=value arguments; hyphenated flags map to underscore variable names.
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --model_name=*)
+          model_name="${var#*=}"
+      ;;
+      --eval-question-file=*)
+          eval_question_file="${var#*=}"
+      ;;
+      --eval-image-folder=*)
+          eval_image_folder="${var#*=}"
+      ;;
+      --eval-annotation-file=*)
+          eval_annotation_file="${var#*=}"
+      ;;
+      --eval-result-file=*)
+          eval_result_file="${var#*=}"
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# Run mm_evaluation/textvqa.py with the parsed arguments.
+function run_evaluation {
+    python mm_evaluation/textvqa.py \
+            --model_name "${model_name}" \
+            --eval-question-file "${eval_question_file}" \
+            --eval-image-folder "${eval_image_folder}" \
+            --eval-annotation-file "${eval_annotation_file}" \
+            --eval-result-file "${eval_result_file}" \
+            --trust_remote_code
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
index f618812d11d..930ec7963d7 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/README.md
@@ -1,4 +1,4 @@
-iStep-by-Step
+Step-by-Step
 ============
 This document describes the step-by-step instructions to run [VLM quantization for Phi3-Vision](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) using AutoRound Quantization.
 
@@ -37,11 +37,12 @@ import requests
 import io
 from transformers import AutoModelForCausalLM
 from transformers import AutoProcessor
-from auto_round.auto_quantizer import AutoHfQuantizer
+from neural_compressor.torch.quantization import load
 quantized_model_path = "./tmp_autoround"
-model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, torch_dtype="auto", _attn_implementation='flash_attention_2') # use _attn_implementation='eager' to disable flash attention
+model = load(quantized_model_path, format='huggingface', \
+             trust_remote_code=True, device_map="auto", torch_dtype="auto", _attn_implementation='eager') # use _attn_implementation='flash_attention_2' to enable flash attention
 
-processor = AutoProcessor.from_pretrained(quantized_model_path, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("microsoft/Phi-3-vision-128k-instruct", trust_remote_code=True)
 
 messages = [ \
     {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"}, \
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
index c7db5b17498..b2e2c945ed2 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/main.py
@@ -402,6 +402,8 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
         user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE, safe_serialization=False)
         if tokenizer is not None:
             tokenizer.save_pretrained(args.output_dir)
+        if processor is not None:
+            processor.save_pretrained(args.output_dir)
 
     
     # if args.accuracy:
@@ -412,7 +414,9 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
     #         tasks = tasks.split(',')
     #     model_args = f"pretrained={args.model_name}"
     #     model_args = model_args + f",trust_remote_code={not args.disable_trust_remote_code}"
-    #     user_model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code)
+    #     model_args += f",autogptq=True,gptq_use_triton=True"
+    #     user_model = load(args.model_name, format='huggingface', \
+    #                         trust_remote_code=not args.disable_trust_remote_code, _attn_implementation='eager')
     #     if args.act_bits <= 8:
     #         user_model = model.to(device_str)
 
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
index f7feeec9efd..9fe47669fb1 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Phi-3-vision/run_autoround.sh
@@ -1,14 +1,48 @@
 #!/bin/bash
 set -x
-device=0
-model_name=microsoft/Phi-3-vision-128k-instruct
-CUDA_VISIBLE_DEVICES=$device \
-python3 main.py \
---model_name=$model_name \
---nsamples 512 \
---quantize \
---image_folder /PATH/TO/coco/images/train2017 \
---question_file /PATH/TO/llava_v1_5_mix665k.json \
---output_dir "./tmp_autoround"
 
+function main {
 
+  init_params "$@"
+  run_tuning
+
+}
+
+# Parse --key=value arguments into shell variables; unknown flags abort.
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --model_name=*)
+          model_name="${var#*=}"
+      ;;
+      --image_folder=*)
+          image_folder="${var#*=}"
+      ;;
+      --question_file=*)
+          question_file="${var#*=}"
+      ;;
+      --output_dir=*)
+          output_dir="${var#*=}"
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# Run main.py in quantize mode; expansions are quoted so paths with spaces survive.
+function run_tuning {
+    python main.py \
+            --model_name "${model_name}" \
+            --nsamples 512 \
+            --quantize \
+            --image_folder "${image_folder}" \
+            --question_file "${question_file}" \
+            --output_dir "${output_dir}"
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
deleted file mode 100644
index 47a282501ce..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_autoround.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-set -x
-device=0
-
-# --quant_vision    ## for vision quantization
-
-CUDA_VISIBLE_DEVICES=$device \
-python3 main.py \
---model_name=Qwen/Qwen-VL \
---bits 4 \
---group_size 128 \
---iters 200 \
---seqlen 512 \
---disable_quanted_input \
---quantize \
---image_folder /path/to/coco/images/train2017/ \
---question_file /path/to/Qwen-VL_mix665k.json \
---output_dir "./tmp_autoround"
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_eval.sh
deleted file mode 100644
index 49dc90a25f7..00000000000
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/run_eval.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-set -x
-device=0
-
-model_path='./tmp_autoround'
-model=Qwen-VL
-
-CUDA_VISIBLE_DEVICES=$device python3 mm_evaluation/main.py \
---model_name ${model_path}/${model} \
---trust_remote_code \
---eval_bs 4
-
-
-
-
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md
similarity index 59%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md
index c32dc1cfade..0f60a8cd684 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md
@@ -78,48 +78,102 @@ pip install -r requirements.txt
 
 - **Default Settings:**
 ```bash
-CUDA_VISIBLE_DEVICES=0 python3 main.py --model_name Qwen/Qwen-VL  --bits 4 --group_size 128  --quantize
+sh run_autoround.sh
 ```
 
 
 ## 3. run inference
 
 ```python
-  from transformers import AutoModelForCausalLM, AutoTokenizer
-  from transformers.generation import GenerationConfig
-  import torch
-  from transformers import set_seed
-  set_seed(1234)
-  from auto_round.auto_quantizer import AutoHfQuantizer
-  quantized_model_path = "./tmp_autoround"
-  tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, trust_remote_code=True)
-  # use bf16
-  model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, bf16=True).eval()
-  # use fp16
-  # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True, fp16=True).eval()
-  # use cpu only
-  # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu", trust_remote_code=True).eval()
-  # use cuda device
-  # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cuda", trust_remote_code=True).eval()
-  query = tokenizer.from_list_format([{'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, \
-      {'text': 'Generate the caption in English with grounding:'}, \
-  ])
-  inputs = tokenizer(query, return_tensors='pt')
-  inputs = inputs.to(model.device)
-  with torch.cuda.amp.autocast(): 
-      pred = model.generate(**inputs)
-  response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
-  print(response)
-  # <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(731,806)</box> and<ref> her dog</ref><box>(219,424),(576,896)</box> playing on the beach<|endoftext|>
-  image = tokenizer.draw_bbox_on_latest_picture(response)
-  if image:
-    image.save('2.jpg')
-  else:
-    print("no box")
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
+import torch
+from neural_compressor.torch.quantization import load
+from transformers import set_seed
+set_seed(1234)
+
+quantized_model_path = "./tmp_autoround"
+tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, trust_remote_code=True)
+model = load(quantized_model_path, format='huggingface', device_map="auto", trust_remote_code=True).eval()
+query = tokenizer.from_list_format([{'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, \
+    {'text': 'Generate the caption in English with grounding:'}, \
+])
+inputs = tokenizer(query, return_tensors='pt')
+inputs = inputs.to(model.device)
+with torch.cuda.amp.autocast(): 
+    pred = model.generate(**inputs)
+    
+response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
+print(response)
+# <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(731,806)</box> and<ref> her dog</ref><box>(219,424),(576,896)</box> playing on the beach<|endoftext|>
+image = tokenizer.draw_bbox_on_latest_picture(response)
+if image:
+    image.save('2.jpg')
+else:
+    print("no box")
 
 ```
 
 
+
+- Qwen2-VL-7B-Instruct inference
+
+```python
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+from neural_compressor.torch.quantization import load
+quantized_model_path="./tmp_autoround"
+model = load(quantized_model_path, format='huggingface', device_map="auto",
+             trust_remote_code=True, model_class=Qwen2VLForConditionalGeneration)
+processor = AutoProcessor.from_pretrained(quantized_model_path)
+messages = [{
+    "role": "user",
+    "content": [
+        {
+            "type": "image",
+            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+        },
+        {"type": "text", "text": "Describe this image."},]
+}]
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(
+    text=[text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pt",
+)
+inputs = inputs.to(model.device)
+ 
+# Inference: Generation of the output
+generated_ids = model.generate(**inputs, max_new_tokens=50)
+generated_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text)
+# The image depicts a serene beach scene at sunset. A woman is sitting on the sand, facing a large dog that appears to be a Labrador Retriever. The dog is wearing a harness and is extending its paw towards the woman's hand, possibly
+
+# messages = [{
+#     "role": "user",
+#     "content": [
+#         {
+#             "type": "image",
+#             "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+#         },
+#         {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},]
+# }]
+
+# The label 15 represents an ash cloud. In the context of a volcano, an ash cloud is formed when volcanic ash is ejected into the atmosphere during an eruption. Therefore, the correct answer is:\n\n(4) ash cloud
+
+```
+
+
+
 ## 4. Results
 Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. please follow the [recipe](./run_autoround.sh) and [evaluate script](./run_eval.sh). The results for Qwen-VL are as follows:
 | Metric         | bf16   | INT4   |
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
similarity index 84%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
index 428e985f628..847be2a206b 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
@@ -32,6 +32,7 @@
                                                     convert,
                                                     load)
 
+
 def DataFormating(raw_data, image_folder=None, model_type='qwen'):
     for source in raw_data:
         source_inputs = source['conversations']
@@ -44,6 +45,8 @@ def DataFormating(raw_data, image_folder=None, model_type='qwen'):
                 sentence['value'] = sentence['value'].strip()
                 if 'qwen2' in model_type: # for Qwen2-vl
                     replace_token = '<|vision_start|><|image_pad|><|vision_end|>'
+                elif 'mllama' in model_type:
+                    replace_token = '<|image|>'
                 else:
                     replace_img = os.path.join(image_folder, os.path.basename(source["image"]))
                     replace_token = DEFAULT_IM_START_TOKEN + replace_img + DEFAULT_IM_END_TOKEN + '\n'
@@ -51,17 +54,25 @@ def DataFormating(raw_data, image_folder=None, model_type='qwen'):
     return raw_data
 
 
-def qwen2_preprocess(
+def common_preprocess(
     sources,
     tokenizer: transformers.PreTrainedTokenizer,
     max_len: int,
-    system_message: str = "You are a helpful assistant."
+    system_message: str = "You are a helpful assistant.",
+    model_type='qwen2'
 ) -> Dict:
-    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
-    im_start = "<|im_start|>"
-    im_end = "<|im_end|>"
+    if 'mllama' in model_type:
+        roles = {"user": "<|start_header_id|>user<|end_header_id|>\n", "assistant": "<|start_header_id|>assistant<|end_header_id|>\n"}
+        im_start = "<|start_header_id|>"
+        im_end = "<|end_header_id|>\n"
+        im_dot = '<|eot_id|>'
+        text_start = '<|begin_of_text|>'
+    else :
+        roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
+        im_start = "<|im_start|>"
+        im_end = "<|im_end|>"
     nl_tokens = '\n'
-    _system = 'system' + nl_tokens
+    _system = 'system'
 
     # Apply prompt templates
     inputs, targets = [], []
@@ -70,12 +81,19 @@ def qwen2_preprocess(
             source = source[1:]
 
         text, target = "", None
-        system = im_start + _system + system_message + im_end + nl_tokens
+        if 'mllama' in model_type:
+            system = text_start + im_start + _system + im_end + nl_tokens + system_message + im_dot
+        else:
+            system = im_start + _system + nl_tokens + system_message + im_end + nl_tokens
         text += system
         for j, sentence in enumerate(source):
             role = roles[sentence["from"]]
-            _text = role + nl_tokens + \
-                sentence["value"] + im_end + nl_tokens
+            if 'mllama' in model_type:
+                _text = role + nl_tokens + \
+                    sentence["value"] + im_dot
+            else:
+                _text = role + nl_tokens + \
+                    sentence["value"] + im_end + nl_tokens
             text += _text
         token_length = len(tokenizer(text).input_ids)
         if token_length < max_len:
@@ -144,6 +162,8 @@ def preprocess(
         attention_mask=input_ids.ne(tokenizer.pad_token_id),
     )
 
+
+
 class LazySupervisedDataset(Dataset):
     """Dataset for supervised fine-tuning."""
 
@@ -164,34 +184,46 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
         if i in self.cached_data_dict:
             return self.cached_data_dict[i]
 
-        if 'qwen2' not in model_type:
+        if 'qwen' == model_type: # for Qwen-VL
             ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
             ret = dict(
                 input_ids=ret["input_ids"][0],
                 labels=ret["labels"][0],
                 attention_mask=ret["attention_mask"][0],
             )
-        else:
-            texts = qwen2_preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
-            image_path = os.path.join(f"file://{self.image_folder}", os.path.basename(self.raw_data[i]["image"]))
-            image_inputs = fetch_image({'image':image_path})
+        else: # Qwen2-VL and Llama-3.2 
+            texts = common_preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len, model_type=model_type)
+            if 'qwen2' in model_type:
+                image_path = os.path.join(f"file://{self.image_folder}", os.path.basename(self.raw_data[i]["image"]))
+                image = fetch_image({'image':image_path})
+            else:
+                image = Image.open(os.path.join(self.image_folder, os.path.basename(self.raw_data[i]["image"]))) #.convert('RGB')
             ret = self.tokenizer.processor(
                 text=texts,
-                images=image_inputs,
-                videos=None,
+                images=image,
                 padding=True,
                 truncation=True,
                 return_tensors="pt",
+                # videos=None,
             )
-            ret = dict(
-                input_ids=ret["input_ids"][0],
-                # labels=ret["labels"][0],
-                attention_mask=ret["attention_mask"][0],
-                image_grid_thw=ret["image_grid_thw"][0],
-                pixel_values=ret["pixel_values"],
-            )
+            if 'qwen2' in model_type:
+                ret = dict(
+                    input_ids=ret["input_ids"][0],
+                    # labels=ret["labels"][0],
+                    attention_mask=ret["attention_mask"][0],
+                    image_grid_thw=ret["image_grid_thw"][0],
+                    pixel_values=ret["pixel_values"],
+                )
+            else:
+                ret = dict(
+                    input_ids=ret["input_ids"][0],
+                    attention_mask=ret["attention_mask"][0],
+                    aspect_ratio_ids=ret["aspect_ratio_ids"][0],
+                    aspect_ratio_mask=ret["aspect_ratio_mask"][0],
+                    cross_attention_mask=ret["cross_attention_mask"][0],
+                    pixel_values=ret["pixel_values"][0],
+                )
         self.cached_data_dict[i] = ret
-
         return ret
 
 
@@ -396,18 +428,14 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         questions = json.load(open(args.question_file, "r"))
         config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
         model_type = config.model_type
-        if 'qwen2' not in model_type: # for Qwen-VL/Qwen-VL-Chat
-            tokenizer.pad_token_id = tokenizer.eod_id
-            config.use_cache = False
-            if dtype_str == "bf16":
-                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code, bf16=True).eval()
-            elif dtype_str == "fp16":
-                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code, fp16=True).eval()
-            else:
-                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code).eval()
-            # raw_data = DataFormating(questions, args.image_folder)
-            default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
-        else: # for Qwen2-VL-instruct
+        if "mllama" in model_type: #for Llama-3.2-11B-Vision-Instruct
+            from transformers import MllamaForConditionalGeneration, AutoProcessor
+            model = MllamaForConditionalGeneration.from_pretrained(args.model_name, 
+                                                                trust_remote_code=not args.disable_trust_remote_code) # torch_dtype=torch.bfloat16
+            processor = AutoProcessor.from_pretrained(args.model_name)
+            tokenizer.processor = processor
+            default_collator = default_data_collator
+        elif 'qwen2' in model_type: # for Qwen2-VL-instruct
             transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
             if transformers_version[0] == 4 and transformers_version[1] < 45:
                 error_message = "Please upgrade transformers to version >= 4.45 or the newest source code to support lm-head quantization."
@@ -418,6 +446,17 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
             processor = AutoProcessor.from_pretrained(args.model_name)
             tokenizer.processor = processor
             default_collator = default_data_collator
+        else: # for Qwen-VL/Qwen-VL-Chat
+            tokenizer.pad_token_id = tokenizer.eod_id
+            config.use_cache = False
+            if dtype_str == "bf16":
+                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code, bf16=True).eval()
+            elif dtype_str == "fp16":
+                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code, fp16=True).eval()
+            else:
+                model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=not args.disable_trust_remote_code).eval()
+            # raw_data = DataFormating(questions, args.image_folder)
+            default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
             
         raw_data = DataFormating(questions, args.image_folder, model_type=model_type)
         dataset = LazySupervisedDataset(raw_data, tokenizer,
@@ -468,8 +507,12 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
                     quant_config.set_local(n, AutoRoundConfig(dtype="fp32"))
                     print(
                         f"{n} will not be quantized due to its shape not being divisible by 32, resulting in an exporting issue to autogptq")
+                    
         # skip special layers
         quant_config.set_local("transformer.visual.attn_pool.*_proj", AutoRoundConfig(dtype="fp32"))
+        quant_config.set_local("multi_modal_projector", AutoRoundConfig(dtype="fp32"))
+        quant_config.set_local("visual.merger", AutoRoundConfig(dtype="fp32"))
+        
 
         if not args.quant_lm_head:
             quant_config.set_local(lm_head_layer_name, AutoRoundConfig(dtype="fp32"))
@@ -482,11 +525,13 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         user_model = prepare(model=model, quant_config=quant_config)
         run_fn_for_vlm_autoround(user_model, *run_args)
         user_model = convert(user_model)
-
+        
         from neural_compressor.torch.utils import (LoadFormat,)
         user_model.save(args.output_dir, format=LoadFormat.HUGGINGFACE)
         if tokenizer is not None:
             tokenizer.save_pretrained(args.output_dir)
+        if processor is not None:
+            processor.save_pretrained(args.output_dir)
 
     if args.accuracy:
         model_name = args.model_name
@@ -494,7 +539,16 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         torch_device = torch.device(device_str)
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code,
                                                   padding_side="right", use_fast=False)
-        model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code)
+        config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
+        model_type = config.model_type
+        model_cls = None
+        if 'mllama' in model_type or 'qwen2' in model_type:
+            print(f"{model_type} quantized model evaluation is not supported yet.")
+            exit()
+        if 'qwen2' in model_type: ## TODO test the eval ability
+            from transformers import Qwen2VLForConditionalGeneration
+            model_cls = Qwen2VLForConditionalGeneration
+        model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code, model_class=model_cls)
         model = model.to(torch_device)
         datasets=args.eval_dataset.split(',')
         for dataset in datasets:
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/__init__.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/__init__.py
similarity index 100%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/__init__.py
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/__init__.py
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py
similarity index 100%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_multiple_choice.py
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_vqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py
similarity index 100%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/evaluate_vqa.py
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
similarity index 100%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/main.py
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa.py
similarity index 100%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa.py
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa.py
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa_eval.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa_eval.py
similarity index 100%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/mm_evaluation/vqa_eval.py
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/vqa_eval.py
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/requirements.txt b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/requirements.txt
similarity index 64%
rename from examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/requirements.txt
rename to examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/requirements.txt
index 2d060638bbc..a76b24c8e9e 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Qwen-VL/requirements.txt
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/requirements.txt
@@ -1,4 +1,4 @@
-transformers
+transformers==4.45.2
 torch
 tiktoken
 transformers_stream_generator
@@ -11,4 +11,7 @@ protobuf
 auto-gptq
 openpyxl
 wandb
-py-cpuinfo
\ No newline at end of file
+py-cpuinfo
+# for Qwen2-VL
+Pillow
+qwen_vl_utils
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_autoround.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_autoround.sh
new file mode 100644
index 00000000000..34cd76065b9
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_autoround.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_tuning
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --model_name=*)
+          model_name=$(echo $var |cut -f2 -d=)
+      ;;
+      --image_folder=*)
+          image_folder=$(echo $var |cut -f2 -d=)
+      ;;
+      --question_file=*)
+          question_file=$(echo $var |cut -f2 -d=)
+      ;;
+      --output_dir=*)
+          output_dir=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# run_tuning
+function run_tuning {
+    python main.py \
+            --model_name ${model_name} \
+            --bits 4 \
+            --group_size 128 \
+            --iters 200 \
+            --seqlen 512 \
+            --disable_quanted_input \
+            --quantize \
+            --image_folder ${image_folder} \
+            --question_file ${question_file} \
+            --output_dir ${output_dir}
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh
new file mode 100644
index 00000000000..81b8181cb80
--- /dev/null
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_evaluation
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --model_name=*)
+          model_name=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# run_evaluation
+function run_evaluation {
+    python mm_evaluation/main.py \
+            --model_name ${model_name} \
+            --trust_remote_code \
+            --eval_bs 4
+}
+
+main "$@"
diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py
index 7d22c7efbc9..6a3474b2f3a 100644
--- a/neural_compressor/torch/algorithms/weight_only/save_load.py
+++ b/neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -55,6 +55,7 @@ def save(model, output_dir="./saved_results", format=LoadFormat.DEFAULT, **kwarg
     os.makedirs(output_dir, exist_ok=True)
     if format == LoadFormat.HUGGINGFACE:  # pragma: no cover
         config = model.config
+        config_file = "quantize_config.json"
         quantization_config = config.quantization_config if hasattr(config, "quantization_config") else None
         if "backend" in quantization_config and "auto_round" in quantization_config["backend"]:
             safe_serialization = kwargs.get("safe_serialization", True)
@@ -64,6 +65,8 @@ def save(model, output_dir="./saved_results", format=LoadFormat.DEFAULT, **kwarg
                 tokenizer.save_pretrained(output_dir)
             del model.save
             model.save_pretrained(output_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
+            with open(os.path.join(output_dir, config_file), "w", encoding="utf-8") as f:
+                json.dump(quantization_config, f, indent=2)
             return
 
     qmodel_weight_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
@@ -227,8 +230,13 @@ def load_hf_format_woq_model(self):
         ):  # # pragma: no cover
             # load autoround format quantized model
             from auto_round import AutoRoundConfig
-
-            model = model_class.from_pretrained(self.model_name_or_path)
+            hf_kargs = {}
+            pretrain_args = ["trust_remote_code", "_attn_implementation", "device_map", "torch_dtype"]
+            for item in pretrain_args:
+                arg_value = self.kwargs.get(item, None)
+                if arg_value is not None:
+                    hf_kargs[item] = arg_value
+            model = model_class.from_pretrained(self.model_name_or_path, **hf_kargs)
             return model
         # get loaded state_dict
         self.loaded_state_dict = self._get_loaded_state_dict(config)
@@ -450,7 +458,12 @@ def _get_model_class_and_config(self):
                 AutoModelForCausalLM.register(config.__class__, model_class, exist_ok=True)
         elif type(config) in AutoModelForCausalLM._model_mapping.keys():
             model_class = _get_model_class(config, AutoModelForCausalLM._model_mapping)
-
+        else:
+            model_cls = self.kwargs.pop("model_class", None)
+            if model_cls:
+                model_class = model_cls
+            else:
+                logger.info(f"Could't find model class.")
         return model_class, config
 
     def _get_loaded_state_dict(self, config):
@@ -908,3 +921,4 @@ def _use_hpu_module(self):  # pragma: no cover
                 if os.path.exists(os.path.join(self._model_local_dir, HPU_WEIGHT_NAME)):
                     return True
         return False
+

From 335f29ed82bb6ecd3a61d4455291d75a2d04a465 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 15 Oct 2024 16:12:27 +0000
Subject: [PATCH 22/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 neural_compressor/torch/algorithms/weight_only/save_load.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py
index 6a3474b2f3a..d5e6fbef235 100644
--- a/neural_compressor/torch/algorithms/weight_only/save_load.py
+++ b/neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -230,6 +230,7 @@ def load_hf_format_woq_model(self):
         ):  # # pragma: no cover
             # load autoround format quantized model
             from auto_round import AutoRoundConfig
+
             hf_kargs = {}
             pretrain_args = ["trust_remote_code", "_attn_implementation", "device_map", "torch_dtype"]
             for item in pretrain_args:
@@ -463,7 +464,7 @@ def _get_model_class_and_config(self):
             if model_cls:
                 model_class = model_cls
             else:
-                logger.info(f"Could't find model class.")
+                logger.info("Couldn't find model class.")
         return model_class, config
 
     def _get_loaded_state_dict(self, config):
@@ -921,4 +922,3 @@ def _use_hpu_module(self):  # pragma: no cover
                 if os.path.exists(os.path.join(self._model_local_dir, HPU_WEIGHT_NAME)):
                     return True
         return False
-

From b3bea7f78d1dcede4327911f85f99967d9bf8fbf Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Wed, 16 Oct 2024 00:34:57 +0800
Subject: [PATCH 23/33] fix ci

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 examples/.config/model_params_pytorch_3x.json | 441 +++++++++---------
 1 file changed, 220 insertions(+), 221 deletions(-)

diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index a1f33413864..eef6f084eb9 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -1,225 +1,224 @@
 {
-    "pytorch": {
-      "llava_woq_autoround_int4":{
-        "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava",
-        "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
-        "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
-        "input_model": "liuhaotian/llava-v1.5-7b",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "qwenvl_woq_autoround_int4":{
-        "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL",
-        "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
-        "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
-        "input_model": "Qwen/Qwen-VL",
-        "main_script": "main.py",
-        "batch_size": 8
-      },
-      "Phi3Vision_woq_autoround_int4":{
-        "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi3-3-vision",
-        "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
-        "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
-        "input_model": "microsoft/Phi-3-vision-128k-instruct",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "opt_125m_woq_gptq_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_woq_gptq_int4_dq_bnb":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_woq_gptq_int4_dq_ggml":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "llama2_7b_gptq_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "llama2_7b_gptq_int4_dq_bnb":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "llama2_7b_gptq_int4_dq_ggml":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_rtn_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_rtn_int4_dq_bnb":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_rtn_int4_dq_ggml":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_gptq_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_gptq_int4_dq_bnb":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_gptq_int4_dq_ggml":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "gpt_j_woq_awq_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_woq_awq_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_woq_autoround_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_woq_autotune_int4":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "gpt_j_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "gpt_j_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "llama2_7b_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "llama2_7b_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "opt_125m_ipex":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "opt_125m_ipex_sq":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 8
-      },
-      "dlrm_ipex": {
-        "model_src_dir": "recommendation/dlrm/static_quant/ipex",
-        "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
-        "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
-        "main_script": "dlrm_s_pytorch.py",
-        "batch_size": 16384
-      },
-      "resnet18_pt2e_static":{
-        "model_src_dir": "cv/static_quant",
-        "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
-        "input_model": "",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "resnet18_fp8_static":{
-        "model_src_dir": "cv/fp8_quant",
-        "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
-        "input_model": "",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "opt_125m_pt2e_static":{
-        "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "run_clm_no_trainer.py",
-        "batch_size": 1
-      },
-      "sdxl_ipex_sq":{
-        "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant",
-        "dataset_location": "",
-        "input_model": "",
-        "main_script": "main.py",
-        "batch_size": 1
-      },
-      "resnet18_mixed_precision": {
-        "model_src_dir": "cv/mixed_precision",
-        "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
-        "input_model": "resnet18",
-        "main_script": "main.py",
-        "batch_size": 20
-      }
+  "pytorch": {
+    "llava_woq_autoround_int4":{
+      "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+      "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
+      "input_model": "liuhaotian/llava-v1.5-7b",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "qwenvl_woq_autoround_int4":{
+      "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+      "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
+      "input_model": "Qwen/Qwen-VL",
+      "main_script": "main.py",
+      "batch_size": 8
+    },
+    "Phi3Vision_woq_autoround_int4":{
+      "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi-3-vision",
+      "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+      "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
+      "input_model": "microsoft/Phi-3-vision-128k-instruct",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_gptq_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_gptq_int4_dq_bnb":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_gptq_int4_dq_ggml":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "llama2_7b_gptq_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "llama2_7b_gptq_int4_dq_bnb":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "llama2_7b_gptq_int4_dq_ggml":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_rtn_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_rtn_int4_dq_bnb":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_rtn_int4_dq_ggml":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_gptq_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_gptq_int4_dq_bnb":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_gptq_int4_dq_ggml":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "gpt_j_woq_awq_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_awq_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_autoround_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_woq_autotune_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "gpt_j_ipex":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "gpt_j_ipex_sq":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "llama2_7b_ipex":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "llama2_7b_ipex_sq":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "opt_125m_ipex":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "opt_125m_ipex_sq":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 8
+    },
+    "dlrm_ipex": {
+      "model_src_dir": "recommendation/dlrm/static_quant/ipex",
+      "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
+      "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
+      "main_script": "dlrm_s_pytorch.py",
+      "batch_size": 16384
+    },
+    "resnet18_pt2e_static":{
+      "model_src_dir": "cv/static_quant",
+      "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+      "input_model": "",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "resnet18_fp8_static":{
+      "model_src_dir": "cv/fp8_quant",
+      "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+      "input_model": "",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "opt_125m_pt2e_static":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
+    "sdxl_ipex_sq":{
+      "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
+    "resnet18_mixed_precision": {
+      "model_src_dir": "cv/mixed_precision",
+      "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+      "input_model": "resnet18",
+      "main_script": "main.py",
+      "batch_size": 20
     }
   }
 }

From 33d49e1a3b594ea5fae8e67e65d5f366e245e93e Mon Sep 17 00:00:00 2001
From: "Sun, Xuehao" <xuehao.sun@intel.com>
Date: Thu, 17 Oct 2024 10:07:52 +0800
Subject: [PATCH 24/33] fix EOF error

Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
---
 .../quantization/auto_round/Llava/run_eval.sh                   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
index cce4c8c5a3b..e1a37f86b8c 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
@@ -45,7 +45,7 @@ function run_evaluation {
             --eval-image-folder ${eval-image-folder} \
             --eval-annotation-file ${eval-annotation-file} \
             --eval-result-file ${eval-result-file} \
-            --trust_remote_code \
+            --trust_remote_code
 }
 
 main "$@"

From 414811aa6a4643cfde0381d4ab99a18ab9304eb7 Mon Sep 17 00:00:00 2001
From: "Sun, Xuehao" <xuehao.sun@intel.com>
Date: Thu, 17 Oct 2024 10:29:23 +0800
Subject: [PATCH 25/33] update qwen dir

Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
---
 examples/.config/model_params_pytorch_3x.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index eef6f084eb9..126810de5b0 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -9,7 +9,7 @@
       "batch_size": 1
     },
     "qwenvl_woq_autoround_int4":{
-      "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL",
+      "model_src_dir": "multimodal-modeling/quantization/auto_round/common_model",
       "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
       "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
       "input_model": "Qwen/Qwen-VL",

From 3630267375c40aec868d3090b09da0e269ad34f6 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 17 Oct 2024 14:54:22 +0800
Subject: [PATCH 26/33] refine shell, add llama3.2 inference to doc

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Llava/run_eval.sh | 29 ++++++++---------
 .../auto_round/common_model/README.md         | 31 +++++++++++++++++++
 .../auto_round/common_model/main.py           |  8 +++--
 .../auto_round/common_model/run_eval.sh       |  8 ++---
 4 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
index cce4c8c5a3b..5545d39113e 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
@@ -16,17 +16,17 @@ function init_params {
       --model_name=*)
           model_name=$(echo $var |cut -f2 -d=)
       ;;
-      --eval-question-file=*)
-          eval-question-file=$(echo $var |cut -f2 -d=)
+      --eval_question_file=*)
+          eval_question_file=$(echo $var |cut -f2 -d=)
       ;;
-      --eval-image-folder=*)
-          eval-image-folder=$(echo $var |cut -f2 -d=)
+      --eval_image_folder=*)
+          eval_image_folder=$(echo $var |cut -f2 -d=)
       ;;
-      --eval-annotation-file=*)
-          eval-annotation-file=$(echo $var |cut -f2 -d=)
+      --eval_annotation_file=*)
+          eval_annotation_file=$(echo $var |cut -f2 -d=)
       ;;
-      --eval-result-file=*)
-          eval-result-file=$(echo $var |cut -f2 -d=)
+      --eval_result_file=*)
+          eval_result_file=$(echo $var |cut -f2 -d=)
       ;;  
       *)
           echo "Error: No such parameter: ${var}"
@@ -39,13 +39,14 @@ function init_params {
 
 # run_evaluation
 function run_evaluation {
-    python mm_evaluation/textvqa.py \
+    python main.py \
+            --accuracy \
             --model_name ${model_name} \
-            --eval-question-file ${eval-question-file} \
-            --eval-image-folder ${eval-image-folder} \
-            --eval-annotation-file ${eval-annotation-file} \
-            --eval-result-file ${eval-result-file} \
-            --trust_remote_code \
+            --eval_question_file ${eval_question_file} \
+            --eval_image_folder ${eval_image_folder} \
+            --eval_annotation_file ${eval_annotation_file} \
+            --eval_result_file ${eval_result_file}
 }
 
 main "$@"
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md
index 0f60a8cd684..00b9f0ee330 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/README.md
@@ -173,6 +173,37 @@ print(output_text)
 ```
 
 
+- Llama-3.2-11B-Vision-Instruct inference
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import MllamaForConditionalGeneration, AutoProcessor
+from neural_compressor.torch.quantization import load
+quantized_model_path="./tmp_autoround"
+model = load(quantized_model_path, format='huggingface', device_map="auto", torch_dtype=torch.bfloat16,
+             trust_remote_code=True, model_class=MllamaForConditionalGeneration)
+processor = AutoProcessor.from_pretrained(quantized_model_path)
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
+inputs = processor(image, prompt, return_tensors="pt", truncation=True).to(model.device)
+
+output = model.generate(**inputs, max_new_tokens=30)
+print(processor.decode(output[0]))
+
+# <|begin_of_text|><|image|><|begin_of_text|>If I had to write a haiku for this one, it would be:
+
+# Rabbit in a coat
+# Dressed up in style for the day
+# Country charm abounds
+
+# The image depicts a rabbit
+```
+
 
 ## 4. Results
 Using [COCO 2017](https://cocodataset.org/) and [LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) datasets for quantization calibration, and TextVQA dataset for evaluation. please follow the [recipe](./run_autoround.sh) and [evaluate script](./run_eval.sh). The results for Qwen-VL are as follows:
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
index 847be2a206b..37c4bce4df1 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
@@ -429,8 +429,12 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
         model_type = config.model_type
         if "mllama" in model_type: #for Llama-3.2-11B-Vision-Instruct
+            transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
+            if transformers_version[0] == 4 and transformers_version[1] < 45:
+                error_message = "Please upgrade transformers to version >= 4.45 or the newest source code to support Llama-3.2-Vision (Mllama) quantization."
+                raise EnvironmentError(error_message)
             from transformers import MllamaForConditionalGeneration, AutoProcessor
-            model = MllamaForConditionalGeneration.from_pretrained(args.model_name, 
+            model = MllamaForConditionalGeneration.from_pretrained(args.model_name, attn_implementation="eager",
                                                                 trust_remote_code=not args.disable_trust_remote_code) # torch_dtype=torch.bfloat16
             processor = AutoProcessor.from_pretrained(args.model_name)
             tokenizer.processor = processor
@@ -438,7 +442,7 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         elif 'qwen2' in model_type: # for Qwen2-VL-instruct
             transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
             if transformers_version[0] == 4 and transformers_version[1] < 45:
-                error_message = "Please upgrade transformers to version >= 4.45 or the newest source code to support lm-head quantization."
+                error_message = "Please upgrade transformers to version >= 4.45 or the newest source code to support Qwen2-VL quantization."
                 raise EnvironmentError(error_message)
             from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
             from qwen_vl_utils import process_vision_info, fetch_image
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh
index 81b8181cb80..7bc295f46c7 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/run_eval.sh
@@ -27,10 +27,10 @@ function init_params {
 
 # run_evaluation
 function run_evaluation {
-    python mm_evaluation/main.py \
-            --model_name ${model_name} \
-            --trust_remote_code \
-            --eval_bs 4
+    python main.py \
+        --accuracy \
+        --model_name ${model_name} \
+        --eval_bs 4
 }
 
 main "$@"

From 4190934881bb4c7d817aaf1ede31d979154340f8 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Thu, 17 Oct 2024 16:58:42 +0800
Subject: [PATCH 27/33] bugfix

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../torch/algorithms/weight_only/save_load.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py
index d5e6fbef235..f0a2aaf7179 100644
--- a/neural_compressor/torch/algorithms/weight_only/save_load.py
+++ b/neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -230,7 +230,6 @@ def load_hf_format_woq_model(self):
         ):  # # pragma: no cover
             # load autoround format quantized model
             from auto_round import AutoRoundConfig
-
             hf_kargs = {}
             pretrain_args = ["trust_remote_code", "_attn_implementation", "device_map", "torch_dtype"]
             for item in pretrain_args:
@@ -436,10 +435,11 @@ def _get_model_class_and_config(self):
 
         config = AutoConfig.from_pretrained(self.model_name_or_path, trust_remote_code=trust_remote_code)
         # quantization_config = config.quantization_config
-
+        
         if kwarg_attn_imp is not None and config._attn_implementation != kwarg_attn_imp:  # pragma: no cover
             config._attn_implementation = kwarg_attn_imp
-
+            
+        
         has_remote_code = hasattr(config, "auto_map") and AutoModelForCausalLM.__name__ in config.auto_map
 
         has_local_code = type(config) in AutoModelForCausalLM._model_mapping.keys()
@@ -449,7 +449,11 @@ def _get_model_class_and_config(self):
             has_local_code,
             has_remote_code,
         )
-
+        
+        model_class = self.kwargs.get("model_class", None)
+        if model_class:
+            return model_class, config
+                
         if has_remote_code and trust_remote_code:  # pragma: no cover
             class_ref = config.auto_map[AutoModelForCausalLM.__name__]
             model_class = get_class_from_dynamic_module(class_ref, self.model_name_or_path, **kwargs_orig)
@@ -460,11 +464,7 @@ def _get_model_class_and_config(self):
         elif type(config) in AutoModelForCausalLM._model_mapping.keys():
             model_class = _get_model_class(config, AutoModelForCausalLM._model_mapping)
         else:
-            model_cls = self.kwargs.pop("model_class", None)
-            if model_cls:
-                model_class = model_cls
-            else:
-                logger.info("Couldn't find model class.")
+            logger.info(f"Could't find model class.")
         return model_class, config
 
     def _get_loaded_state_dict(self, config):
@@ -922,3 +922,4 @@ def _use_hpu_module(self):  # pragma: no cover
                 if os.path.exists(os.path.join(self._model_local_dir, HPU_WEIGHT_NAME)):
                     return True
         return False
+

From dd9a4beee775fd657b45f05b109fd55e9163c64c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 17 Oct 2024 09:03:11 +0000
Subject: [PATCH 28/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../torch/algorithms/weight_only/save_load.py       | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py
index f0a2aaf7179..4f7edc995ce 100644
--- a/neural_compressor/torch/algorithms/weight_only/save_load.py
+++ b/neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -230,6 +230,7 @@ def load_hf_format_woq_model(self):
         ):  # # pragma: no cover
             # load autoround format quantized model
             from auto_round import AutoRoundConfig
+
             hf_kargs = {}
             pretrain_args = ["trust_remote_code", "_attn_implementation", "device_map", "torch_dtype"]
             for item in pretrain_args:
@@ -435,11 +436,10 @@ def _get_model_class_and_config(self):
 
         config = AutoConfig.from_pretrained(self.model_name_or_path, trust_remote_code=trust_remote_code)
         # quantization_config = config.quantization_config
-        
+
         if kwarg_attn_imp is not None and config._attn_implementation != kwarg_attn_imp:  # pragma: no cover
             config._attn_implementation = kwarg_attn_imp
-            
-        
+
         has_remote_code = hasattr(config, "auto_map") and AutoModelForCausalLM.__name__ in config.auto_map
 
         has_local_code = type(config) in AutoModelForCausalLM._model_mapping.keys()
@@ -449,11 +449,11 @@ def _get_model_class_and_config(self):
             has_local_code,
             has_remote_code,
         )
-        
+
         model_class = self.kwargs.get("model_class", None)
         if model_class:
             return model_class, config
-                
+
         if has_remote_code and trust_remote_code:  # pragma: no cover
             class_ref = config.auto_map[AutoModelForCausalLM.__name__]
             model_class = get_class_from_dynamic_module(class_ref, self.model_name_or_path, **kwargs_orig)
@@ -464,7 +464,7 @@ def _get_model_class_and_config(self):
         elif type(config) in AutoModelForCausalLM._model_mapping.keys():
             model_class = _get_model_class(config, AutoModelForCausalLM._model_mapping)
         else:
-            logger.info(f"Could't find model class.")
+            logger.info("Couldn't find model class.")
         return model_class, config
 
     def _get_loaded_state_dict(self, config):
@@ -922,4 +922,3 @@ def _use_hpu_module(self):  # pragma: no cover
                 if os.path.exists(os.path.join(self._model_local_dir, HPU_WEIGHT_NAME)):
                     return True
         return False
-

From d4cd3bd3d4ef6da0fbee346f5faa16702f82b14b Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Fri, 18 Oct 2024 13:28:51 +0800
Subject: [PATCH 29/33] bugfix: use underscore-style eval CLI flags in Llava main.py

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Llava/main.py                 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
index b9bd77c0aef..c5f435e5191 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
@@ -216,13 +216,13 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
                             help="The dataset for quantization training. It can be a custom one.")
     
     # ================= Evaluation Related =====================
-    parser.add_argument("--eval-question-file", type=str, default="tables/question.jsonl")
+    parser.add_argument("--eval_question_file", type=str, default="tables/question.jsonl")
     
-    parser.add_argument("--eval-image-folder", type=str)
+    parser.add_argument("--eval_image_folder", type=str)
     
-    parser.add_argument('--eval-result-file', type=str)
+    parser.add_argument('--eval_result_file', type=str)
     
-    parser.add_argument('--eval-annotation-file', type=str)
+    parser.add_argument('--eval_annotation_file', type=str)
 
     args = parser.parse_args()
 

From d9193cf9b7c6f3b94336456f77c1a77e2bd08d57 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Fri, 18 Oct 2024 14:27:24 +0800
Subject: [PATCH 30/33] bugfix: init processor and select Mllama model class for eval

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/common_model/main.py              | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
index 37c4bce4df1..8f7f48065a2 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
@@ -428,6 +428,7 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         questions = json.load(open(args.question_file, "r"))
         config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=not args.disable_trust_remote_code)
         model_type = config.model_type
+        processor = None
         if "mllama" in model_type: #for Llama-3.2-11B-Vision-Instruct
             transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
             if transformers_version[0] == 4 and transformers_version[1] < 45:
@@ -552,6 +553,9 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
         if 'qwen2' in model_type: ## TODO test the eval ability
             from transformers import Qwen2VLForConditionalGeneration
             model_cls = Qwen2VLForConditionalGeneration
+        elif 'mllama' in model_type:
+            from transformers import MllamaForConditionalGeneration, AutoProcessor
+            model_cls = MllamaForConditionalGeneration
         model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code, model_class=model_cls)
         model = model.to(torch_device)
         datasets=args.eval_dataset.split(',')

From d4b9f52d2b31573e8a9e87e946464036e34a9802 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Fri, 18 Oct 2024 14:48:01 +0800
Subject: [PATCH 31/33] refine eval shell

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../quantization/auto_round/Llava/main.py                 | 2 +-
 .../quantization/auto_round/Llava/run_eval.sh             | 8 ++------
 .../auto_round/common_model/mm_evaluation/main.py         | 3 ++-
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
index c5f435e5191..ca97a8f3797 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/main.py
@@ -220,7 +220,7 @@ def create_data_loader(dataset, batch_size=1, data_collator=None):
     
     parser.add_argument("--eval_image_folder", type=str)
     
-    parser.add_argument('--eval_result_file', type=str)
+    parser.add_argument('--eval_result_file', type=str, default="./tmp_results")
     
     parser.add_argument('--eval_annotation_file', type=str)
 
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
index dbc60d6cdd4..c6c978e465d 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/Llava/run_eval.sh
@@ -24,10 +24,7 @@ function init_params {
       ;;
       --eval_annotation_file=*)
           eval_annotation_file=$(echo $var |cut -f2 -d=)
-      ;;
-      --eval_result_file=*)
-          eval_result_file=$(echo $var |cut -f2 -d=)
-      ;;  
+      ;; 
       *)
           echo "Error: No such parameter: ${var}"
           exit 1
@@ -44,8 +41,7 @@ function run_evaluation {
             --model_name ${model_name} \
             --eval_question_file ${eval_question_file} \
             --eval_image_folder ${eval_image_folder} \
-            --eval_annotation_file ${eval_annotation_file} \
-            --eval_result_file ${eval_result_file}
+            --eval_annotation_file ${eval_annotation_file}
 }
 
 main "$@"
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
index 11668d5a930..399b52b4531 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
@@ -20,7 +20,7 @@
         help="Whether to enable trust_remote_code"
     )
     parser.add_argument(
-        "--device", default="cuda:0",
+        "--device", default="cpu",
         help="PyTorch device (e.g. cpu/cuda:0/hpu) for evaluation."
     )
     parser.add_argument(
@@ -99,3 +99,4 @@
     print("cost time: ", time.time() - s)
 
 
+

From dade1f655da35343c4ca74c091f76b834559e6f0 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Fri, 18 Oct 2024 15:48:40 +0800
Subject: [PATCH 32/33] fix eval device issue

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../mm_evaluation/evaluate_multiple_choice.py        | 12 ++++++------
 .../common_model/mm_evaluation/evaluate_vqa.py       |  9 +++++----
 .../auto_round/common_model/mm_evaluation/main.py    |  4 ++--
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py
index 11c8944072e..9d802ebff13 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_multiple_choice.py
@@ -109,8 +109,8 @@ def scienceQA_evaluation(model_name, dataset_name, dataset_path=None, tokenizer=
     #     world_size=int(os.getenv('WORLD_SIZE', '1')),
     #     rank=int(os.getenv('RANK', '0')),
     # )
-
-    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
+    if "cuda" in device:
+        torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
     if isinstance(model_name, str):
         config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
         model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=trust_remote_code).eval()
@@ -141,14 +141,14 @@ def scienceQA_evaluation(model_name, dataset_name, dataset_path=None, tokenizer=
                 chunk_sizes) in tqdm(enumerate(dataloader)):
 
             outputs = model(
-                input_ids=input_tokens[:, :-1].cuda(),
-                attention_mask=attention_mask[:, :-1].cuda(),
+                input_ids=input_tokens[:, :-1].to(device),
+                attention_mask=attention_mask[:, :-1].to(device),
                 return_dict=True,
             )
             losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
                 0, 2, 1),
                                                        input_tokens[:,
-                                                                    1:].cuda(),
+                                                                    1:].to(device),
                                                        reduction='none')
 
             losses = losses.split(chunk_sizes, dim=0)
@@ -213,4 +213,4 @@ def scienceQA_evaluation(model_name, dataset_name, dataset_path=None, tokenizer=
     )
     print("cost time: ", time.time() - s)
 
-    
\ No newline at end of file
+    
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py
index a6192af4d5a..e055e71c63b 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/evaluate_vqa.py
@@ -260,8 +260,8 @@ def textVQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", data
     #     world_size=int(os.getenv('WORLD_SIZE', '1')),
     #     rank=int(os.getenv('RANK', '0')),
     # )
-
-    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
+    if "cuda" in device:
+        torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
     if isinstance(model_name, str):
         config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
         model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=trust_remote_code).eval()
@@ -303,8 +303,8 @@ def textVQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", data
     for _, (question_ids, input_ids, attention_mask,
             annotations) in tqdm(enumerate(dataloader)):
         pred = model.generate(
-            input_ids=input_ids.cuda(),
-            attention_mask=attention_mask.cuda(),
+            input_ids=input_ids.to(device),
+            attention_mask=attention_mask.to(device),
             do_sample=False,
             num_beams=1,
             max_new_tokens=ds_collections[dataset_name]['max_new_tokens'],
@@ -462,3 +462,4 @@ def textVQA_evaluation(model_name, dataset_name, base_model="Qwen/Qwen-VL", data
     print("cost time: ", time.time() - s)
 
 
+
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
index 399b52b4531..e767a3a3dc1 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
@@ -73,7 +73,7 @@
     for dataset in test_tasks:
         if 'vqa' in dataset:
             from evaluate_vqa import textVQA_evaluation
-            with torch.cuda.amp.autocast():
+            with torch.amp.autocast(device_type=device.split(":")[0], dtype=torch_dtype):
                 evaluator = textVQA_evaluation(
                     model,
                     dataset_name=dataset,
@@ -85,7 +85,7 @@
                 )
         elif 'scienceqa' in dataset:
             from evaluate_multiple_choice import scienceQA_evaluation
-            with torch.cuda.amp.autocast():
+            with torch.amp.autocast(device_type=device.split(":")[0], dtype=torch_dtype):
                 evaluator = scienceQA_evaluation(
                     model,
                     dataset_name=dataset,

From 749812c51bbced9fb68c36ca8be135cb3e1dbae9 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1" <weiwei1.zhang@intel.com>
Date: Fri, 18 Oct 2024 16:29:39 +0800
Subject: [PATCH 33/33] refine eval dtype

Signed-off-by: Zhang, Weiwei1 <weiwei1.zhang@intel.com>
---
 .../auto_round/common_model/main.py           | 32 +++++++++++--------
 .../common_model/mm_evaluation/main.py        |  6 ++--
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
index 8f7f48065a2..91a5b948539 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/main.py
@@ -539,6 +539,7 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
             processor.save_pretrained(args.output_dir)
 
     if args.accuracy:
+        torch_dtype = "auto"
         model_name = args.model_name
         device_str = detect_device(args.device)
         torch_device = torch.device(device_str)
@@ -558,26 +559,29 @@ def get_train_dataloader(train_dataset, model, data_collator=default_data_collat
             model_cls = MllamaForConditionalGeneration
         model = load(args.model_name, format='huggingface', trust_remote_code=not args.disable_trust_remote_code, model_class=model_cls)
         model = model.to(torch_device)
+        torch_dtype = model.dtype
         datasets=args.eval_dataset.split(',')
         for dataset in datasets:
             if 'vqa' in dataset:
                 from mm_evaluation.evaluate_vqa import textVQA_evaluation
-                evaluator = textVQA_evaluation(
-                    model,
-                    dataset_name=dataset,
-                    tokenizer=tokenizer,
-                    batch_size=args.eval_bs,
-                    device=str(torch_device)
-                )
+                with torch.amp.autocast(device_type=device_str.split(":")[0], dtype=torch_dtype):
+                    evaluator = textVQA_evaluation(
+                        model,
+                        dataset_name=dataset,
+                        tokenizer=tokenizer,
+                        batch_size=args.eval_bs,
+                        device=str(torch_device)
+                    )
             elif 'scienceqa' in dataset:
                 from mm_evaluation.evaluate_multiple_choice import scienceQA_evaluation
-                evaluator = scienceQA_evaluation(
-                    model,
-                    dataset_name=dataset,
-                    tokenizer=tokenizer,
-                    batch_size=args.eval_bs,
-                    device=str(torch_device)
-                )
+                with torch.amp.autocast(device_type=device_str.split(":")[0], dtype=torch_dtype):
+                    evaluator = scienceQA_evaluation(
+                        model,
+                        dataset_name=dataset,
+                        tokenizer=tokenizer,
+                        batch_size=args.eval_bs,
+                        device=str(torch_device)
+                    )
 
 
 
diff --git a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
index e767a3a3dc1..bbd6344921c 100644
--- a/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
+++ b/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/common_model/mm_evaluation/main.py
@@ -69,7 +69,7 @@
     test_tasks = args.tasks
     if isinstance(test_tasks, str):
         test_tasks = test_tasks.split(',')
-
+    device = args.device
     for dataset in test_tasks:
         if 'vqa' in dataset:
             from evaluate_vqa import textVQA_evaluation
@@ -81,7 +81,7 @@
                     tokenizer=tokenizer,
                     batch_size=args.eval_bs,
                     trust_remote_code=args.trust_remote_code,
-                    device=str(args.device)
+                    device=str(device)
                 )
         elif 'scienceqa' in dataset:
             from evaluate_multiple_choice import scienceQA_evaluation
@@ -93,7 +93,7 @@
                     tokenizer=tokenizer,
                     batch_size=args.eval_bs,
                     trust_remote_code=args.trust_remote_code,
-                    device=str(args.device)
+                    device=str(device)
                 )
 
     print("cost time: ", time.time() - s)