Commit 3ffb650

update readme and fix CI
Signed-off-by: He, Xin3 <[email protected]>
1 parent: 99b8fff

File tree

- examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
- neural_compressor/common/base_config.py
- neural_compressor/torch/algorithms/weight_only/autoround.py

3 files changed: +6 -9 lines changed

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md

Lines changed: 4 additions & 5 deletions
@@ -8,7 +8,7 @@ In this example, you can verify the accuracy on HPU/CUDA device with emulation o
 # neural-compressor-pt
 pip install neural-compressor-pt==3.7
 # auto-round
-pip install auto-round==0.9.1
+pip install auto-round==0.9.2
 # other requirements
 pip install -r requirements.txt
 ```
@@ -19,7 +19,7 @@ pip install -r requirements.txt
 # neural-compressor-pt
 INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master
 # auto-round
-pip install git+https://github.com/intel/auto-round.git@main
+pip install git+https://github.com/intel/auto-round.git@more-ar-ext
 # other requirements
 pip install -r requirements.txt
 ```
@@ -44,7 +44,7 @@ CUDA_VISIBLE_DEVICES=0 python quantize.py \
 ```

 Notes:
-- Use `--export_format auto_round` for `MXFP4`, `MXFP8` data type and do inference as [below](#mxfp4--mxfp8)
+- Use `--export_format auto_round` for `MXFP4`, `MXFP8` data type and do inference as below.
 - Use `--export_format llm_compressor` for `NVFP4` data type since public vLLM supports it.
 - Use `--export_format fake` for `uNVFP4` data type since it's not fully supported.
 - Setting `--quant_lm_head` applies `--dtype` for the lm_head layer.
@@ -87,7 +87,6 @@ AutoRound helps improve the accuracy, `iters` and `nsamples` is higher than defa
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-8B --dtype=mxfp8 --input_model=/models/Meta-Llama-3.1-8B-Instruct --output_model=Llama-3.1-8B-MXFP8
 ```

-
 #### Llama 3.1 8B MXFP4 (Mixed with MXFP8, Target_bits=7.8)

 ```bash
@@ -119,7 +118,7 @@ Note: If you got OOM issue, either increasing `CUDA_VISIBLE_DEVICES` or reducing

 ## Inference

-### MXFP4 / MXFP8
+### MXFP4 & MXFP8

 - Both pure MXFP4/MXFP8 and mix-precision model generated by target bits are supported.

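The README's Notes point at running inference on a checkpoint exported with `--export_format auto_round`. As a rough, non-authoritative sketch of what that step typically looks like with the standard transformers API (the directory name `Llama-3.1-8B-MXFP8` is just the hypothetical `--output_model` path from the command above, and an `auto_round` export may need additional loading arguments):

```python
# Hedged sketch only: assumes the quantized checkpoint in "Llama-3.1-8B-MXFP8"
# (hypothetical path) can be loaded through the standard transformers AutoModel
# API; auto_round exports may require extra kwargs.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "Llama-3.1-8B-MXFP8"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto")

prompt = "The key benefit of MXFP8 quantization is"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```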

neural_compressor/common/base_config.py

Lines changed: 1 addition & 0 deletions
@@ -190,6 +190,7 @@ class BaseConfig(ABC):
     name = BASE_CONFIG
     params_list = []
     _is_initialized = False
+    non_tunable_params = ["white_list"]

     def __init__(self, white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST) -> None:
         """Initialize the BaseConfig.

neural_compressor/torch/algorithms/weight_only/autoround.py

Lines changed: 1 addition & 4 deletions
@@ -370,21 +370,18 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
         model = rounder.model
         model.autoround_config = rounder.layer_config

+        self.accelerator.empty_cache()
         dump_model_op_stats(rounder.layer_config)

         if self.export_format in ["auto_round", "llm_compressor"]:
             # the directly returned model is QuantLinear, which is used for packing.
             try:
-                del model
-                self.accelerator.empty_cache()
                 logger.info(f"Quantization is done, reloading model from saved directory({self.output_dir})...")
                 import transformers  # pylint: disable=E0401

                 model = transformers.AutoModelForCausalLM.from_pretrained(self.output_dir)
             except:
                 pass
-        else:
-            self.accelerator.empty_cache()

         return model

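This diff consolidates the cleanup: the accelerator cache is emptied once, right after quantization, instead of separately in the reload and fallback branches. A rough sketch of the underlying free-then-reload pattern, using the public `torch.cuda.empty_cache()` in place of the internal `accelerator.empty_cache()` helper and a hypothetical output directory:

```python
# Sketch under stated assumptions: torch.cuda.empty_cache() stands in for the
# internal accelerator wrapper, and "quantized_out" is a hypothetical directory
# that quantization has already written a loadable checkpoint to.
import gc

import torch
from transformers import AutoModelForCausalLM


def reload_packed_model(output_dir: str = "quantized_out"):
    # Release Python references and cached device blocks before materializing a
    # second model from disk, which keeps peak memory lower during the reload.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return AutoModelForCausalLM.from_pretrained(output_dir)
```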
