 from ..utils.logger import logger
 from ..utils.memory_tracker import memory_tracker
 import time
+from tqdm.auto import tqdm
 
 try:
     import ctransformers
+    from ctransformers import AutoModelForCausalLM as CTAutoModel
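+    # aliased to keep it distinct from transformers' AutoModelForCausalLM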
     CT_AVAILABLE = True
 except ImportError:
     CT_AVAILABLE = False
@@ -149,14 +151,18 @@ def quantize(
                   for i in range(0, total_layers, self.chunk_size)]
 
         start_time = time.perf_counter()
-        for chunk_idx, chunk in enumerate(chunks):
-            logger.log_info(f"\nProcessing chunk {chunk_idx + 1}/{len(chunks)}")
+
+        # Create progress bars for chunks and layers
+        chunk_pbar = tqdm(chunks, desc="Processing chunks", position=0)
+        layer_pbar = tqdm(total=total_layers, desc="Quantizing layers", position=1, leave=True)
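+        # position=0/1 stacks the two bars; leave=True keeps the layer bar visible when done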
+
+        for chunk_idx, chunk in enumerate(chunk_pbar):
+            chunk_pbar.set_description(f"Processing chunk {chunk_idx + 1}/{len(chunks)}")
 
             for idx, (name, module) in enumerate(chunk, 1):
                 try:
                     current_layer = idx + chunk_idx * self.chunk_size
-                    logger.log_info(f"\nQuantizing layer {current_layer}/{total_layers}: {name}")
-                    logger.log_info(f"Layer shape: {list(module.weight.shape)}")
+                    layer_pbar.set_description(f"Layer {current_layer}/{total_layers}: {name}")
 
                     # Move layer to target device if needed
                     if module.weight.device != device:
@@ -174,11 +180,11 @@ def quantize(
                     else:
                         setattr(self.model, name, quantized_layer)
 
-                    # Log progress
+                    # Update progress
+                    layer_pbar.update(1)
                     elapsed_time = time.perf_counter() - start_time
-                    progress = current_layer / total_layers
-                    eta = elapsed_time / progress - elapsed_time if progress > 0 else 0
-                    logger.log_info(f"Progress: {progress * 100:.1f}% | ETA: {eta:.1f}s")
+                    eta = elapsed_time / (current_layer / total_layers) - elapsed_time if current_layer > 0 else 0
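+                    # ETA = projected total time minus elapsed, extrapolated from average time per finished layer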
+                    layer_pbar.set_postfix({"ETA": f"{eta:.1f}s"})
 
                     self._clear_memory()
 
@@ -191,6 +197,10 @@ def quantize(
             torch.cuda.empty_cache()
             gc.collect()
 
+        # Close progress bars
+        layer_pbar.close()
+        chunk_pbar.close()
+
         # Log final statistics
         total_time = time.perf_counter() - start_time
         logger.log_info("\n" + "=" * 60)
@@ -322,28 +332,31 @@ def convert_to_gguf(self, output_path: str):
             self.model.to('cpu')
             memory_tracker.log_memory("model_moved_to_cpu")
 
-            # Prepare GGUF conversion config
-            config = {
-                "quantization": {
-                    "bits": self.bits,
-                    "type": self.quant_type,
-                    "group_size": self.group_size if self.group_size > 0 else None,
-                },
-                "metadata": {
-                    "description": "Model quantized using QuantLLM GGUF quantizer",
-                    "format_version": "legacy" if self.legacy_format else "latest",
-                    "has_act_desc": self.desc_act,
-                    "has_tensor_desc": self.desc_ten
-                }
-            }
+            # Save model in HF format first
+            temp_dir = f"{output_path}_temp_hf"
+            self.model.save_pretrained(temp_dir)
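+            # (assumes self.model is an HF PreTrainedModel exposing save_pretrained)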
 
             # Convert using ctransformers
-            ctransformers.convert(
-                self.model,
-                output_path,
-                config=config,
-                legacy=self.legacy_format
-            )
+            try:
+                # Use ctransformers to load and save in GGUF format
+                ct_model = CTAutoModel.from_pretrained(
+                    temp_dir,
+                    model_type="llama",  # Default to llama, can be parameterized later
+                    model_file=None,
+                    config={
+                        "max_new_tokens": 2048,
+                        "context_length": 2048,
+                        "gpu_layers": 0  # CPU conversion
+                    }
+                )
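+                # NOTE: assumes this ctransformers version accepts a plain config dict
+                # and exposes save_pretrained on the loaded model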
+                ct_model.save_pretrained(output_path)
+
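+                # Remove the temporary HF checkpoint once conversion succeeds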
+                import shutil
+                shutil.rmtree(temp_dir, ignore_errors=True)
+
+            except Exception as e:
+                logger.log_error(f"ctransformers conversion failed: {str(e)}")
+                raise
 
             memory_tracker.log_memory("gguf_conversion_complete")
             logger.log_info("GGUF conversion completed successfully")
@@ -362,3 +375,4 @@ def _clear_memory(self):
             torch.cuda.synchronize()
         memory_tracker.clear_memory()
 
+