@@ -128,6 +128,7 @@ def is_model_on_gpu(model) -> bool:
128128 """Returns if the model is fully loaded on GPUs."""
129129 return all ("cuda" in str (param .device ) for param in model .parameters ())
130130
131+
131132def get_tokenizer (ckpt_path , trust_remote_code = False ):
132133 """Returns the tokenizer from the model ckpt_path."""
133134 print (f"Initializing tokenizer from { ckpt_path } " )
@@ -143,6 +144,7 @@ def get_tokenizer(ckpt_path, trust_remote_code=False):
143144
144145 return tokenizer
145146
147+
146148def quantize_and_export_model (
147149 args : argparse .Namespace ,
148150):
@@ -188,7 +190,7 @@ def quantize_and_export_model(
188190 else :
189191 print ("Model is already quantized, Skipping quantization..." )
190192 quantized_model = model
191-
193+
192194 mtq .print_quant_summary (quantized_model )
193195 if not model_is_already_quantized :
194196 print ("--------" )
@@ -199,11 +201,6 @@ def quantize_and_export_model(
199201 print (f"example outputs after ptq: { generated_str_after_ptq } " )
200202
201203 export_hf_vllm_fq_checkpoint (quantized_model , args .export_path )
202- # from modelopt.torch.quantization.utils import get_quantizer_state_dict
203- # quantized_model.save_pretrained(args.export_path, state_dict=quantized_model.state_dict(), save_modelopt_state=False)
204- # modelopt_state = mto.modelopt_state(quantized_model)
205- # modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(quantized_model)
206- # torch.save(modelopt_state, f"{args.export_path}/modelopt_state.pth")
207204 tokenizer .save_pretrained (args .export_path )
208205 print (f"Model exported to { args .export_path } " )
209206
0 commit comments