@@ -197,7 +197,7 @@ def replace_architectures(json_path):
json.dump(data, file, indent=4)

def eval_func(model):
-from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser

model_dir = model
if isinstance(model, str) and model.endswith(".onnx"):
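A note on the recurring import change: the lm-eval-based evaluation helpers that these examples previously imported from intel_extension_for_transformers now ship in neural_compressor.evaluation. A minimal usage sketch, assuming a model and tokenizer are already loaded; the model and user_model arguments mirror the hunks in this diff, while the remaining field names and the task choice are illustrative assumptions:

from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser

eval_args = LMEvalParser(
    model="hf",                # evaluate a Hugging Face-style model, as in the hunks below
    user_model=user_model,     # the in-memory model object to score
    tokenizer=tokenizer,       # assumed field name for the matching tokenizer
    tasks="lambada_openai",    # assumed task selection, for illustration only
    batch_size=8,
)
results = evaluate(eval_args)
acc = results["results"]["lambada_openai"]["acc"]  # result layout as used later in this diff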
@@ -7,6 +7,7 @@ onnxruntime-extensions; python_version < '3.11'
datasets
optimum
evaluate
+intel-extension-for-transformers >= 1.4.1
peft
-lm-eval==0.4.2
+lm-eval==0.4.3
numba
pydantic
@@ -134,7 +134,7 @@ def replace_architectures(json_path):
json.dump(data, file, indent=4)

def eval_func(model):
-from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser

model_dir = model
if isinstance(model, str) and model.endswith(".onnx"):
@@ -7,6 +7,7 @@ onnxruntime-extensions; python_version < '3.11'
datasets
optimum
evaluate
+intel-extension-for-transformers >= 1.4.1
peft
-lm-eval==0.4.2
+lm-eval==0.4.3
numba
pydantic
@@ -1,7 +1,6 @@
accelerate
datasets
einops
-intel-extension-for-transformers
optimum
peft
sentencepiece
@@ -10,4 +9,6 @@ torch
tqdm
tiktoken
transformers_stream_generator
-lm_eval==0.4.2
+lm_eval==0.4.3
+numba
+pydantic
@@ -588,7 +588,7 @@ def group_texts(examples):
eval_batch = args.per_device_eval_batch_size
user_model = None if args.use_accelerate else model

-from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
user_model=user_model,
@@ -8,6 +8,6 @@ transformers
pytest
wandb
einops
neural-compressor
intel-extension-for-transformers
-lm_eval==0.4.2
+lm_eval==0.4.3
numba
pydantic
@@ -89,10 +89,10 @@ function run_benchmark {
extra_cmd=$extra_cmd" --woq_algo TEQ"
elif [ "${topology}" = "opt_125m_ipex" ]; then
model_name_or_path="facebook/opt-125m"
-extra_cmd=$extra_cmd" --ipex --int8_bf16_mixed"
+extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "opt_125m_ipex_sq" ]; then
model_name_or_path="facebook/opt-125m"
-extra_cmd=$extra_cmd" --ipex --int8_bf16_mixed --sq --alpha 0.5"
+extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
elif [ "${topology}" = "bloom_560m_ipex_sq" ]; then
model_name_or_path="bigscience/bloom-560m"
extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
@@ -343,7 +343,10 @@ def eval_func(model):

if args.ipex:
user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
from transformers import AutoTokenizer, AutoConfig
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
setattr(user_model, "config", config)
else:
user_model, tokenizer = get_user_model()
kwargs = {'weight_only': True} if args.approach == 'weight_only' else {}
@@ -354,7 +357,7 @@ def eval_func(model):
if args.accuracy:
user_model.eval()
if args.code_generation:
-from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
+from neural_compressor.evaluation.bigcode_eval import evaluate
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
results = evaluate(
@@ -370,7 +373,7 @@ def eval_func(model):
else:
acc = results["results"][task_name]["acc"]
else:
-from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
user_model=user_model,
@@ -395,7 +398,7 @@ def eval_func(model):
samples = args.iters * args.batch_size

if args.code_generation:
-from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
+from neural_compressor.evaluation.bigcode_eval import evaluate
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
start = time.time()
@@ -413,7 +416,7 @@ def eval_func(model):
else:
acc = results["results"][task_name]["acc"]
else:
-from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
user_model=user_model,
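In the IPEX branch above, the quantized model reloaded from disk has no Hugging Face config attached, so the script now fetches one and sets it on the model before evaluation. A condensed sketch of that pattern; the model name and output directory are placeholders, and the load import path is an assumption based on the usual Neural Compressor checkpoint loader:

import os
from transformers import AutoConfig, AutoTokenizer
from neural_compressor.utils.pytorch import load  # assumed import path for the checkpoint loader

model_id = "facebook/opt-125m"   # placeholder model name
output_dir = "./saved_results"   # placeholder path to the quantized checkpoint

user_model = load(os.path.abspath(os.path.expanduser(output_dir)))
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
user_model.config = config  # mirrors the setattr in the hunk: evaluation code reads model.config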
@@ -45,7 +45,8 @@ function run_tuning {
--tune \
--batch_size $batch_size \
--output_dir ${output_model} \
-    --cache_dir ${dataset_location}
+    --cache_dir ${dataset_location} \
+    --trust_remote_code

}

@@ -24,13 +24,15 @@
help='the folder path to save the results.')
parser.add_argument('--cache_dir', default=None, type=str,
help='the folder path to save the results.')
parser.add_argument("--trust_remote_code", action="store_true")

args = parser.parse_args()
model_name = 'openai/whisper-large'
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
# dataset
-librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test", cache_dir=args.cache_dir)
+librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test", cache_dir=args.cache_dir,
+                                      trust_remote_code=args.trust_remote_code)

# metric
wer = load("wer")
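The Whisper example now threads a --trust_remote_code flag through to load_dataset, since recent datasets releases ask for an explicit opt-in before running a dataset's loading script (librispeech_asr is such a dataset). A minimal sketch of the flag handling, with the cache path left as a placeholder:

import argparse
from datasets import load_dataset

parser = argparse.ArgumentParser()
parser.add_argument("--cache_dir", default=None, type=str)
parser.add_argument("--trust_remote_code", action="store_true")
args = parser.parse_args()

# Opt in to the dataset's loading script only when the flag is passed explicitly.
librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test",
                                      cache_dir=args.cache_dir,
                                      trust_remote_code=args.trust_remote_code)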
3 changes: 3 additions & 0 deletions neural_compressor/adaptor/torch_utils/awq.py
@@ -454,6 +454,9 @@ def block_inference(self, model):
"""
total_out = []
for args, kwargs in zip(self.total_block_args, self.total_block_kwargs):
+# avoid layer_past being a DynamicCache object when transformers is newer than 4.45.1
+if "layer_past" in kwargs.keys() and kwargs["layer_past"] is not None:
+    kwargs["layer_past"] = None
out = model(*args, **kwargs)
if isinstance(out, tuple): # pragma: no cover
out = out[0]
8 changes: 5 additions & 3 deletions neural_compressor/adaptor/torch_utils/bf16_convert.py
@@ -19,6 +19,7 @@
import torch.nn as nn

from ...utils import logger
+from .util import append_attr


class BF16ModuleWrapper(nn.Module):
@@ -62,9 +63,10 @@ def Convert(model, tune_cfg):
def _bf16_wrapper_model(model, bf16_ops_list, prefix=""):
for name, child in model.named_children():
op_name = prefix + "." + name if prefix != "" else name
-_bf16_wrapper_model(child, bf16_ops_list, op_name)
for bf16_op_name in bf16_ops_list:
if op_name == bf16_op_name[0] or op_name == bf16_op_name[0].split(".module")[0]:
-child = BF16ModuleWrapper(child)
-setattr(model, name, child)
+_bf16_wrapper_model(child, bf16_ops_list, op_name)
+child_bf16 = BF16ModuleWrapper(child)
+append_attr(child_bf16, child)
+setattr(model, name, child_bf16)
return model
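The rewritten _bf16_wrapper_model above moves the recursive call so it runs before a matched child is wrapped, and uses append_attr to copy the original module's attributes onto the BF16ModuleWrapper so they stay reachable after wrapping; the new test assertion below checks exactly that. A small sketch of the wrapping step, assuming the import paths match the files touched in this diff and that append_attr copies plain attributes such as stride:

import torch.nn as nn
from neural_compressor.adaptor.torch_utils.bf16_convert import BF16ModuleWrapper
from neural_compressor.adaptor.torch_utils.util import append_attr

conv = nn.Conv2d(3, 8, kernel_size=3, stride=1)
wrapped = BF16ModuleWrapper(conv)  # wrapper that runs this module in bfloat16
append_attr(wrapped, conv)         # copy the original module's attributes onto the wrapper
print(wrapped.stride)              # attributes of the wrapped conv remain accessible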
9 changes: 7 additions & 2 deletions test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2x.py
@@ -401,8 +401,13 @@ def test_mix_precision(self):
ptq_fx_op_name_list["conv.*"] = {"weight": {"dtype": "bf16"}, "activation": {"dtype": "bf16"}}
conf = PostTrainingQuantConfig(op_name_dict=ptq_fx_op_name_list)
q_model = quantization.fit(model_origin, conf, calib_dataloader=dataloader, calib_func=eval_func)
-self.assertEqual(q_model._model.conv.module.module.weight.dtype, torch.bfloat16)
-self.assertEqual(q_model._model.conv.module.module.bias.dtype, torch.bfloat16)
+self.assertEqual(q_model._model.conv.module.weight.dtype, torch.bfloat16)
+self.assertEqual(q_model._model.conv.module.bias.dtype, torch.bfloat16)
+self.assertEqual(
+    q_model._model.conv.stride[0],
+    1,
+    msg="GraphModule object should have the attributes of the original module.",
+)

def test_hawq_metric(self):
# Test for hawq metric