
Add --pre-tokenizer option to convert
Galunid committed Jun 16, 2024
commit f05a0e0 (parent bc6c457)
Showing 2 changed files with 12 additions and 2 deletions.
convert-hf-to-gguf-update.py: 3 additions, 0 deletions
@@ -173,6 +173,9 @@ def download_model(model):
 
 src_func = f"""
     def get_vocab_base_pre(self, tokenizer) -> str:
+        if self.pre_tokenizer is not None:
+            return self.pre_tokenizer
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
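For context, the comment block kept above describes the fallback that the new early return bypasses: the script encodes a fixed check string with the model's tokenizer, hashes the resulting token IDs, and looks the digest up in a table of known pre-tokenizers. A rough sketch of that idea follows; the check string and the digest table are placeholders here, not the values used by the actual script.

# Sketch only: hash-based pre-tokenizer detection as described in the comments above.
from hashlib import sha256

def detect_pre_tokenizer(tokenizer, known_hashes: dict[str, str]) -> str | None:
    chktxt = "example check text 123"      # placeholder; the real script uses a much longer string
    chktok = tokenizer.encode(chktxt)      # token IDs produced with the model's pre-tokenizer
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    return known_hashes.get(chkhsh)        # None means the pre-tokenizer was not recognized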
convert-hf-to-gguf.py: 9 additions, 2 deletions
@@ -65,14 +65,15 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None, pre_tokenizer: str | None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
         self.ftype = ftype
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
+        self.pre_tokenizer = pre_tokenizer
         self.lazy = not eager
         self.model_name = model_name
         self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
@@ -405,6 +406,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
+        if self.pre_tokenizer is not None:
+            return self.pre_tokenizer
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
@@ -2800,6 +2803,10 @@ def parse_args() -> argparse.Namespace:
         "--model-name", type=str, default=None,
         help="name of the model",
     )
+    parser.add_argument(
+        "--pre-tokenizer", type=str, default=None,
+        help="overwrite pre-tokenizer, if not specified this script will try to detect it automatically"
+    )
     parser.add_argument(
         "--verbose", action="store_true",
         help="increase output verbosity",
@@ -2857,7 +2864,7 @@ def main() -> None:
             logger.error(f"Model {hparams['architectures'][0]} is not supported")
             sys.exit(1)
 
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name, args.pre_tokenizer)
 
         logger.info("Set model parameters")
         model_instance.set_gguf_parameters()
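Taken together, the change threads one optional string from the command line through the Model constructor to get_vocab_base_pre, where an explicit value wins over the hash-based auto-detection. A condensed, self-contained sketch of that flow; the Model class here is heavily simplified, and "llama-bpe" is only an example value.

import argparse

class Model:
    # simplified stand-in for the converter's Model class; the real constructor
    # takes the full argument list shown in the diff above
    def __init__(self, pre_tokenizer: str | None):
        self.pre_tokenizer = pre_tokenizer

    def get_vocab_base_pre(self, tokenizer) -> str:
        if self.pre_tokenizer is not None:   # explicit --pre-tokenizer override wins
            return self.pre_tokenizer
        raise NotImplementedError("hash-based auto-detection omitted from this sketch")

parser = argparse.ArgumentParser()
parser.add_argument("--pre-tokenizer", type=str, default=None)
args = parser.parse_args(["--pre-tokenizer", "llama-bpe"])  # example value only

model = Model(args.pre_tokenizer)
print(model.get_vocab_base_pre(tokenizer=None))  # prints: llama-bpe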
