@@ -67,7 +67,7 @@ def initialize_model_and_tokenizer(model_name_or_path):
parser.add_argument(
    "--model_name_or_path", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct", help="model name or path"
)
- parser.add_argument("--dtype", type=str, default="mx_fp4", choices=["mx_fp4", "mx_fp8", "nv_fp2", "fp4_v2"], help="data type")
+ parser.add_argument("--dtype", type=str, default="MXFP4", choices=["MXFP4", "MXFP8", "NVFP4", "NVFP4+", "uNVFP4"], help="data type")
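+ # The user-facing dtype names accepted here are mapped to auto-round's internal dtype strings before quantization (see autoround_dtype_mapping below).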
parser.add_argument("--quantize", action="store_true", help="whether to quantize model")
parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
@@ -80,13 +80,6 @@ def initialize_model_and_tokenizer(model_name_or_path):
parser.add_argument("--accuracy", action="store_true", help="accuracy measurement")
parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.")
parser.add_argument("--batch_size", default=32, type=int, help="batch size for accuracy evaluation.")
- parser.add_argument(
-     "--mxfp8_mod_list",
-     type=str,
-     nargs="*",
-     default=[],  # default value
-     help="List of module names or patterns for MXFP8 quantization.",
- )
parser.add_argument(
    "--tasks",
    type=str,
@@ -109,6 +102,14 @@ def initialize_model_and_tokenizer(model_name_or_path):
device = "hpu" if is_hpex_available() else "cuda"

if args.quantize:
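+     # Map the user-facing dtype names from the CLI to auto-round's internal dtype strings;
+     # both "uNVFP4" and "NVFP4+" resolve to auto-round's "fp4_v2" data type.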
+     autoround_dtype_mapping = {
+         "MXFP4": "mx_fp4",
+         "MXFP8": "mx_fp8",
+         "NVFP4": "nv_fp4",
+         "uNVFP4": "fp4_v2",
+         "NVFP4+": "fp4_v2",
+     }
+     args.dtype = autoround_dtype_mapping[args.dtype]
    if args.quant_lm_head:
        lm_head_config = {
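            # group size follows the block size of the target format: 32 elements for the MX formats, 16 for the NVFP4 family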
            "group_size": 32 if "mx" in args.dtype else 16,
@@ -155,11 +156,10 @@ def load_recipe_results(file_path):
autoround.quantize()
model = autoround.model

- # set dtype to BF16 for HPU inference performance
- model = model.to(torch.bfloat16)
- model = model.eval().to(device)
-
if args.accuracy:
+     # set dtype to BF16 for HPU inference performance
+     model = model.to(torch.bfloat16)
+     model = model.eval().to(device)
    if is_hpex_available():
        # HPU needs padding to buckets for better performance
        # Generation tasks, such as gsm8k and mmlu-pro, may get OOM.
@@ -240,3 +240,12 @@ def load_recipe_results(file_path):
for task_name, accu in all_accuracy.items():
    print(f"Accuracy for {task_name}: {accu:.4f}")
print(f"Overall accuracy: {sum(all_accuracy.values())/len(all_accuracy):.4f}")
+
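+ # Saving is only supported for the plain single-process, non-recipe flow; the quantized
+ # model is exported in llm_compressor format via autoround.save_quantized().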
+ if args.save:
+     if world_size > 1:
+         assert False, "saving a model quantized with DeepSpeed tensor parallelism is not supported."
+     elif args.use_recipe:
+         assert False, "saving a model quantized with a recipe is not supported."
+     else:
+         autoround.save_quantized(args.save_path, format="llm_compressor")
+         print(f"Quantized model is saved to {args.save_path}")