From f05a0e0a00848bace1484bba7c23dd2ea498a35f Mon Sep 17 00:00:00 2001
From: Galunid
Date: Sun, 16 Jun 2024 20:18:27 +0200
Subject: [PATCH] Add --pre-tokenizer option to convert

---
 convert-hf-to-gguf-update.py |  3 +++
 convert-hf-to-gguf.py        | 11 +++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index fbf1e1ea3de37..439a20bf68c78 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -173,6 +173,9 @@ def download_model(model):
 
 src_func = f"""
     def get_vocab_base_pre(self, tokenizer) -> str:
+        if self.pre_tokenizer is not None:
+            return self.pre_tokenizer
+
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 55ce502dba1c7..bf4a8dc814836 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -65,7 +65,7 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None, pre_tokenizer: str | None):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
@@ -73,6 +73,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
+        self.pre_tokenizer = pre_tokenizer
         self.lazy = not eager
         self.model_name = model_name
         self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
@@ -405,6 +406,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
+        if self.pre_tokenizer is not None:
+            return self.pre_tokenizer
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
@@ -2800,6 +2803,10 @@ def parse_args() -> argparse.Namespace:
         "--model-name", type=str, default=None,
         help="name of the model",
     )
+    parser.add_argument(
+        "--pre-tokenizer", type=str, default=None,
+        help="overwrite pre-tokenizer, if not specified this script will try to detect it automatically"
+    )
     parser.add_argument(
         "--verbose", action="store_true",
         help="increase output verbosity",
@@ -2857,7 +2864,7 @@ def main() -> None:
         logger.error(f"Model {hparams['architectures'][0]} is not supported")
         sys.exit(1)
 
-    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name, args.pre_tokenizer)
 
     logger.info("Set model parameters")
     model_instance.set_gguf_parameters()
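
For illustration, a minimal sketch (not taken from the patch itself) of the control flow this change introduces: when --pre-tokenizer is given, get_vocab_base_pre() returns the user-supplied value and the hash-based auto-detection is skipped. The SketchModel class and the "llama-bpe" value below are hypothetical stand-ins for the real Model subclasses and pre-tokenizer names.

# Sketch only: mirrors the short-circuit added by the patch, outside the real script.
class SketchModel:
    def __init__(self, pre_tokenizer: str | None = None):
        # corresponds to `self.pre_tokenizer = pre_tokenizer` in Model.__init__
        self.pre_tokenizer = pre_tokenizer

    def get_vocab_base_pre(self) -> str:
        if self.pre_tokenizer is not None:
            # manual override from --pre-tokenizer takes precedence
            return self.pre_tokenizer
        # the real script would hash a probe string with the HF tokenizer here
        # and map the hash to a known pre-tokenizer name (elided in this sketch)
        raise NotImplementedError("auto-detection elided in this sketch")

print(SketchModel(pre_tokenizer="llama-bpe").get_vocab_base_pre())  # prints: llama-bpe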