[NPU] Fix save-load usage of minicpm models (#12628)
plusbang authored Dec 27, 2024
1 parent c72a5db commit f17ccfa
Showing 2 changed files with 7 additions and 5 deletions.
python/llm/src/ipex_llm/transformers/npu_model.py (2 additions, 5 deletions)
@@ -445,12 +445,9 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
             from .npu_models.npu_llm_cpp import load_model_from_file
             from .npu_models.convert import generate, general_convert
             from .npu_models.convert import prepare_input_ids, causal_lm_forward
-            config = AutoConfig.from_pretrained(
-                os.path.join(pretrained_model_name_or_path, "config.json"),
-                trust_remote_code=trust_remote_code)
             with torch.device('meta'):
-                model = cls.HF_Model.from_config(
-                    config, trust_remote_code=trust_remote_code)
+                model = cls.HF_Model.from_config(config,
+                                                 trust_remote_code=trust_remote_code)
             try:
                 model_ptr = load_model_from_file(pretrained_model_name_or_path)
                 model.model_ptr = model_ptr
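For context: the hunk above builds the model skeleton under torch.device('meta'), so Transformers creates parameters with shape and dtype only and allocates no real weight storage; the actual weights come from the pre-converted NPU blob that load_model_from_file opens. A minimal standalone sketch of that meta-device pattern (model_dir and the Auto classes here are illustrative, not this repository's code):

import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Illustrative local checkpoint directory; stands in for
# pretrained_model_name_or_path in load_low_bit above.
model_dir = "./saved_npu_model"

# trust_remote_code lets Transformers import a custom modeling file
# (e.g. MiniCPM's) shipped alongside the config.
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)

with torch.device('meta'):
    # Nothing is materialized here: every parameter is created on the
    # "meta" device and records only its shape and dtype.
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

print(next(model.parameters()).device)  # -> meta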
python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py (5 additions)
@@ -444,6 +444,11 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         else:
             lm_head_low_bit = model.lm_head.lm_heads[0].qtype
 
+        if model._auto_class is not None:
+            # For a custom model, copy the file defining it in the folder
+            from transformers.dynamic_module_utils import custom_object_save
+            custom_object_save(model, save_directory, config=model.config)
+
         if model.config.model_type == "qwen2":
             if group_size == 0:
                 if model.config.hidden_size == 1536:
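custom_object_save is an existing Transformers helper (transformers.dynamic_module_utils.custom_object_save): for a model whose class was registered for auto loading from custom code (model._auto_class is set, as with MiniCPM's remote-code modeling files), it copies the .py files defining that class into the save folder, so the saved checkpoint can later be reloaded with trust_remote_code=True. A minimal sketch of the same pattern outside this repository (the checkpoint id and output folder are illustrative):

import os
from transformers import AutoModelForCausalLM
from transformers.dynamic_module_utils import custom_object_save

# Illustrative remote-code checkpoint; MiniCPM ships its own modeling
# classes, so loading it requires trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    "openbmb/MiniCPM-2B-sft-bf16", trust_remote_code=True)

save_directory = "./converted_model"
os.makedirs(save_directory, exist_ok=True)

# _auto_class is set when the class was registered for auto loading
# (e.g. via register_for_auto_class); only such custom models need
# their defining files copied next to the saved weights.
if model._auto_class is not None:
    custom_object_save(model, save_directory, config=model.config)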
