diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
index 29e8653bfab..3da42f9d9d9 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
@@ -197,7 +197,7 @@ def replace_architectures(json_path):
         json.dump(data, file, indent=4)

 def eval_func(model):
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser

     model_dir = model
     if isinstance(model, str) and model.endswith(".onnx"):
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/requirements.txt
index 8279cc72722..fbd60f42f23 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/requirements.txt
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/requirements.txt
@@ -7,6 +7,7 @@ onnxruntime-extensions; python_version < '3.11'
 datasets
 optimum
 evaluate
-intel-extension-for-transformers >= 1.4.1
 peft
-lm-eval==0.4.2
+lm-eval==0.4.3
+numba
+pydantic
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
index e43ceecefe7..a5860c14c24 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
@@ -134,7 +134,7 @@ def replace_architectures(json_path):
         json.dump(data, file, indent=4)

 def eval_func(model):
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser

     model_dir = model
     if isinstance(model, str) and model.endswith(".onnx"):
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt
index 8279cc72722..fbd60f42f23 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt
@@ -7,6 +7,7 @@ onnxruntime-extensions; python_version < '3.11'
 datasets
 optimum
 evaluate
-intel-extension-for-transformers >= 1.4.1
 peft
-lm-eval==0.4.2
+lm-eval==0.4.3
+numba
+pydantic
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/requirements.txt
index e129cb6dc91..9fee9d0543d 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/requirements.txt
@@ -1,7 +1,6 @@
 accelerate
 datasets
 einops
-intel-extension-for-transformers
 optimum
 peft
 sentencepiece
@@ -10,4 +9,6 @@ torch
 tqdm
 tiktoken
 transformers_stream_generator
-lm_eval==0.4.2
+lm_eval==0.4.3
+numba
+pydantic
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
index 5b34ae79382..49e53b5000e 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
@@ -588,7 +588,7 @@ def group_texts(examples):
         eval_batch = args.per_device_eval_batch_size
         user_model = None if args.use_accelerate else model

-        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
         eval_args = LMEvalParser(
             model="hf",
             user_model=user_model,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
index fe73842a104..1f70fea933b 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
@@ -8,6 +8,6 @@ transformers
 pytest
 wandb
 einops
-neural-compressor
-intel-extension-for-transformers
-lm_eval==0.4.2
+lm_eval==0.4.3
+numba
+pydantic
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh
index 0277a26c79c..c9461f822a8 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_benchmark.sh
@@ -89,10 +89,10 @@ function run_benchmark {
         extra_cmd=$extra_cmd" --woq_algo TEQ"
     elif [ "${topology}" = "opt_125m_ipex" ]; then
         model_name_or_path="facebook/opt-125m"
-        extra_cmd=$extra_cmd" --ipex --int8_bf16_mixed"
+        extra_cmd=$extra_cmd" --ipex"
     elif [ "${topology}" = "opt_125m_ipex_sq" ]; then
         model_name_or_path="facebook/opt-125m"
-        extra_cmd=$extra_cmd" --ipex --int8_bf16_mixed --sq --alpha 0.5"
+        extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
     elif [ "${topology}" = "bloom_560m_ipex_sq" ]; then
         model_name_or_path="bigscience/bloom-560m"
         extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 2407840c381..22b5ec453f2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -343,7 +343,10 @@ def eval_func(model):

     if args.ipex:
         user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
+        from transformers import AutoTokenizer, AutoConfig
         tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+        config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+        setattr(user_model, "config", config)
     else:
         user_model, tokenizer = get_user_model()
         kwargs = {'weight_only': True} if args.approach == 'weight_only' else {}
@@ -354,7 +357,7 @@ def eval_func(model):
 if args.accuracy:
     user_model.eval()
     if args.code_generation:
-        from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
+        from neural_compressor.evaluation.bigcode_eval import evaluate
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
         results = evaluate(
@@ -370,7 +373,7 @@ def eval_func(model):
         else:
             acc = results["results"][task_name]["acc"]
     else:
-        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
         eval_args = LMEvalParser(
             model="hf",
             user_model=user_model,
@@ -395,7 +398,7 @@ def eval_func(model):
     samples = args.iters * args.batch_size

     if args.code_generation:
-        from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
+        from neural_compressor.evaluation.bigcode_eval import evaluate
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
         start = time.time()
@@ -413,7 +416,7 @@ def eval_func(model):
         else:
             acc = results["results"][task_name]["acc"]
     else:
-        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
         eval_args = LMEvalParser(
             model="hf",
             user_model=user_model,
diff --git a/examples/pytorch/speech_recognition/whisper_large/quantization/ptq_dynamic/fx/run_quant.sh b/examples/pytorch/speech_recognition/whisper_large/quantization/ptq_dynamic/fx/run_quant.sh
index 1db9e613cd4..1ac159cad17 100755
--- a/examples/pytorch/speech_recognition/whisper_large/quantization/ptq_dynamic/fx/run_quant.sh
+++ b/examples/pytorch/speech_recognition/whisper_large/quantization/ptq_dynamic/fx/run_quant.sh
@@ -45,7 +45,8 @@ function run_tuning {
             --tune \
             --batch_size $batch_size \
             --output_dir ${output_model} \
-            --cache_dir ${dataset_location}
+            --cache_dir ${dataset_location} \
+            --trust_remote_code
 }

diff --git a/examples/pytorch/speech_recognition/whisper_large/quantization/ptq_dynamic/fx/run_whisper_large.py b/examples/pytorch/speech_recognition/whisper_large/quantization/ptq_dynamic/fx/run_whisper_large.py
index fbf9b53d81f..19b93a63837 100755
--- a/examples/pytorch/speech_recognition/whisper_large/quantization/ptq_dynamic/fx/run_whisper_large.py
+++ b/examples/pytorch/speech_recognition/whisper_large/quantization/ptq_dynamic/fx/run_whisper_large.py
@@ -24,13 +24,15 @@
                     help='the folder path to save the results.')
 parser.add_argument('--cache_dir', default=None, type=str,
                     help='the folder path to save the results.')
+parser.add_argument("--trust_remote_code", action="store_true")
 args = parser.parse_args()
 model_name = 'openai/whisper-large'
 processor = WhisperProcessor.from_pretrained(model_name)
 model = WhisperForConditionalGeneration.from_pretrained(model_name)

 # dataset
-librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test", cache_dir=args.cache_dir)
+librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test", cache_dir=args.cache_dir,
+                                      trust_remote_code=args.trust_remote_code)

 # metric
 wer = load("wer")
diff --git a/neural_compressor/adaptor/torch_utils/awq.py b/neural_compressor/adaptor/torch_utils/awq.py
index 35c35624745..a745b4ec21a 100644
--- a/neural_compressor/adaptor/torch_utils/awq.py
+++ b/neural_compressor/adaptor/torch_utils/awq.py
@@ -454,6 +454,9 @@ def block_inference(self, model):
         """
         total_out = []
         for args, kwargs in zip(self.total_block_args, self.total_block_kwargs):
+            # to avoid layer_past: Dynamic_cache when transformers higher than 4.45.1
+            if "layer_past" in kwargs.keys() and kwargs["layer_past"] is not None:
+                kwargs["layer_past"] = None
             out = model(*args, **kwargs)
             if isinstance(out, tuple):  # pragma: no cover
                 out = out[0]
diff --git a/neural_compressor/adaptor/torch_utils/bf16_convert.py b/neural_compressor/adaptor/torch_utils/bf16_convert.py
index b6d5e6d01bd..8c55cbebf94 100644
--- a/neural_compressor/adaptor/torch_utils/bf16_convert.py
+++ b/neural_compressor/adaptor/torch_utils/bf16_convert.py
@@ -19,6 +19,7 @@
 import torch.nn as nn

 from ...utils import logger
+from .util import append_attr


 class BF16ModuleWrapper(nn.Module):
@@ -62,9 +63,10 @@ def Convert(model, tune_cfg):
 def _bf16_wrapper_model(model, bf16_ops_list, prefix=""):
     for name, child in model.named_children():
         op_name = prefix + "." + name if prefix != "" else name
+        _bf16_wrapper_model(child, bf16_ops_list, op_name)
         for bf16_op_name in bf16_ops_list:
             if op_name == bf16_op_name[0] or op_name == bf16_op_name[0].split(".module")[0]:
-                child = BF16ModuleWrapper(child)
-                setattr(model, name, child)
-        _bf16_wrapper_model(child, bf16_ops_list, op_name)
+                child_bf16 = BF16ModuleWrapper(child)
+                append_attr(child_bf16, child)
+                setattr(model, name, child_bf16)
     return model
diff --git a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2x.py b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2x.py
index 1bfa38a0bb7..24bbc47c888 100644
--- a/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2x.py
+++ b/test/adaptor/pytorch_adaptor/test_adaptor_pytorch_2x.py
@@ -401,8 +401,13 @@ def test_mix_precision(self):
         ptq_fx_op_name_list["conv.*"] = {"weight": {"dtype": "bf16"}, "activation": {"dtype": "bf16"}}
         conf = PostTrainingQuantConfig(op_name_dict=ptq_fx_op_name_list)
         q_model = quantization.fit(model_origin, conf, calib_dataloader=dataloader, calib_func=eval_func)
-        self.assertEqual(q_model._model.conv.module.module.weight.dtype, torch.bfloat16)
-        self.assertEqual(q_model._model.conv.module.module.bias.dtype, torch.bfloat16)
+        self.assertEqual(q_model._model.conv.module.weight.dtype, torch.bfloat16)
+        self.assertEqual(q_model._model.conv.module.bias.dtype, torch.bfloat16)
+        self.assertEqual(
+            q_model._model.conv.stride[0],
+            1,
+            msg="GraphModule object should have the attributes of the original module.",
+        )

     def test_hawq_metric(self):
         # Test for hawq metric