Nexesenex · Nexesenex · Jul 21, 2024 · Jul 20, 2024 · Jul 21, 2024 · Jul 21, 2024
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -860,7 +860,7 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
 
       - name: Determine tag name
         id: tag

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -48,7 +48,7 @@ class Model:
 
     dir_model: Path
     ftype: gguf.LlamaFileType
-    fname_out: Path | None
+    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
@@ -62,11 +62,12 @@ class Model:
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
+    dir_model_card: Path
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
@@ -90,6 +91,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path |
         self.tensor_names = None
         self.metadata_override = metadata_override
         self.model_name = model_name
+        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
 
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -345,7 +347,7 @@ def prepare_metadata(self, vocab_only: bool):
 
         total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
 
-        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params)
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
 
         # Fallback to model directory name if metadata name is still missing
         if self.metadata.name is None:
@@ -359,27 +361,22 @@ def prepare_metadata(self, vocab_only: bool):
         output_type: str = self.ftype.name.partition("_")[2]
 
         # Filename Output
-        # Note: `not is_dir()` is used because `.is_file()` will not detect
-        #       file template strings as it doesn't actually exist as a file
-        if self.fname_out is not None and not self.fname_out.is_dir():
-            # Output path is a custom defined templated filename
-
-            # Process templated file name with the output ftype, useful with the "auto" ftype
-            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
-        else:
+        if self.fname_out.is_dir():
             # Generate default filename based on model specification and available metadata
             if not vocab_only:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
             else:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
 
-            # Check if preferred output directory path was provided
-            if self.fname_out is not None and self.fname_out.is_dir():
-                # output path is a directory
-                self.fname_out = self.fname_out / f"{fname_default}.gguf"
-            else:
-                # output in the same directory as the model by default
-                self.fname_out = self.dir_model / f"{fname_default}.gguf"
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            #       file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
 
         self.set_type()
 
@@ -753,7 +750,8 @@ def _create_vocab_sentencepiece(self):
                     token_id = int(token_id)
                     token: str = token_data["content"]
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token.encode("utf-8")
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                     if token_data.get("special") or self.does_token_look_special(token):
                         toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                     else:
@@ -1312,6 +1310,7 @@ def set_vocab(self):
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -2014,7 +2013,8 @@ def set_vocab(self):
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2030,7 +2030,8 @@ def set_vocab(self):
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2269,7 +2270,8 @@ def set_vocab(self):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2288,7 +2290,8 @@ def set_vocab(self):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2474,6 +2477,7 @@ def set_vocab(self):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep",   70)
         special_vocab._set_special_token("eot",    107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
         self.gguf_writer.add_add_space_prefix(False)
@@ -3627,10 +3631,10 @@ def main() -> None:
         logger.error("Error: Cannot use temp file when splitting")
         sys.exit(1)
 
-    fname_out = None
-
     if args.outfile is not None:
         fname_out = args.outfile
+    else:
+        fname_out = dir_model
 
     logger.info(f"Loading model: {dir_model.name}")
 
@@ -3661,7 +3665,6 @@ def main() -> None:
         else:
             logger.info("Exporting model...")
             model_instance.write()
-            assert model_instance.fname_out is not None
             out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
             logger.info(f"Model successfully exported to {out_path}")
 

diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
@@ -290,7 +290,7 @@ def parse_args() -> argparse.Namespace:
         fname_out = args.outfile
     else:
         # output in the same directory as the model by default
-        fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
+        fname_out = dir_lora
 
     if os.path.exists(input_model):
         # lazy import load_file only if lora is in safetensors format.
@@ -304,12 +304,6 @@ def parse_args() -> argparse.Namespace:
     # load base model
     logger.info(f"Loading base model: {dir_base_model.name}")
     hparams = Model.load_hparams(dir_base_model)
-
-    with open(lora_config, "r") as f:
-        lparams: dict[str, Any] = json.load(f)
-
-    alpha: float = lparams["lora_alpha"]
-
     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -320,12 +314,21 @@ def parse_args() -> argparse.Namespace:
         class LoraModel(model_class):
             model_arch = model_class.model_arch
 
+            lora_alpha: float
+
+            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                super().__init__(*args, **kwargs)
+
+                self.dir_model_card = dir_lora_model
+                self.lora_alpha = float(lora_alpha)
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
 
             def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
+                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                 super().set_gguf_parameters()
 
             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@@ -368,6 +371,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)
 
+        with open(lora_config, "r") as f:
+            lparams: dict[str, Any] = json.load(f)
+
+        alpha: float = lparams["lora_alpha"]
+
         model_instance = LoraModel(
             dir_base_model,
             ftype,
@@ -376,6 +384,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             use_temp_file=False,
             eager=args.no_lazy,
             dry_run=args.dry_run,
+            dir_lora_model=dir_lora,
+            lora_alpha=alpha,
         )
 
         logger.info("Exporting model...")