From f7e377d62f20d92a05b4c95647efeff4c2976c18 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:32:22 +0000 Subject: [PATCH 01/30] Add HFVocab into convert.py --- convert.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index e9b08d344f5bd..d6e6558c25c08 100755 --- a/convert.py +++ b/convert.py @@ -414,7 +414,54 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def __repr__(self) -> str: return f"" -Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' +class HFVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) + + added_tokens: dict[str, int] + if fname_added_tokens is not None: + added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) + else: + added_tokens = {} + + vocab_size: int = self.tokenizer.vocab_size + + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) + if expected_ids != actual_ids: + raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") + + items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base: int = vocab_size + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + self.fname_added_tokens = fname_added_tokens + + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + tokenizer = self.tokenizer + reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + byte_encoder = bytes_to_unicode() + byte_decoder = {v: k for k, v in byte_encoder.items()} + print(len(byte_decoder), byte_decoder) + + for i in range(tokenizer.vocab_size): + text = reverse_vocab[i].encode("utf-8") + yield text, 0.0, gguf.TokenType.NORMAL + + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + for text in self.added_tokens_list: + score = -1000.0 + yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + yield from self.hf_tokens() + yield from self.added_tokens() + + def __repr__(self) -> str: + return f"" + +Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab | HFVocab' # # data loading @@ -1084,6 +1131,8 @@ def load_vocab(path: Path, vocabtype: str | None) -> Vocab: return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None) elif vocabtype == "spm": return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) + elif vocabtype == "hf": + return HFVocab(path, added_tokens_path if added_tokens_path.exists() else None) else: raise ValueError(f"Unsupported vocabulary type {vocabtype}") @@ -1120,7 +1169,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") + parser.add_argument("--vocabtype", choices=["spm", "bpe", "hf"], 
help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) args = parser.parse_args(args_in) @@ -1162,7 +1211,7 @@ def main(args_in: list[str] | None = None) -> None: assert args.outfile, "need --outfile if using --vocab-only" # FIXME: Try to respect vocab_dir somehow? vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype in ('bpe', 'hf')) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") @@ -1174,7 +1223,7 @@ def main(args_in: list[str] | None = None) -> None: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir, args.vocabtype) # FIXME: Try to respect vocab_dir somehow? - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype in ('bpe', 'hf')) model = model_plus.model model = convert_model_names(model, params) From b0e00cb868b414cd135fc8a636343b2a583bc87e Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:38:58 +0000 Subject: [PATCH 02/30] Update convert.py --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index d6e6558c25c08..53204f21cdad4 100755 --- a/convert.py +++ b/convert.py @@ -1108,7 +1108,7 @@ def load_vocab(path: Path, vocabtype: str | None) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. - if path.is_dir(): + if path.is_dir() and vocabtype != 'hf': vocab_file = "tokenizer.model" if vocabtype == 'bpe': vocab_file = "vocab.json" From f888d2ea13d6f2838b5f7f74118c8fd7de14e0b9 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:40:14 +0000 Subject: [PATCH 03/30] Update convert.py --- convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert.py b/convert.py index 53204f21cdad4..34f380cd70eee 100755 --- a/convert.py +++ b/convert.py @@ -27,6 +27,7 @@ import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore[import] +from transformers import AutoTokenizer import os if 'NO_LOCAL_GGUF' not in os.environ: From ea9f35f082c3ff377055af462a4f5e94866240e7 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:42:03 +0000 Subject: [PATCH 04/30] add bytes_to_unicode function --- convert.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/convert.py b/convert.py index 34f380cd70eee..98ea6d9c8d2c7 100755 --- a/convert.py +++ b/convert.py @@ -301,6 +301,27 @@ def load(model_plus: ModelPlus) -> Params: # # vocab # +def bytes_to_unicode(): + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + return dict(zip(bs, (chr(n) for n in cs))) + class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: From c7b636e9507a9ae7dd81242004ab1c9497ea942a Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:43:22 +0000 Subject: [PATCH 05/30] change add_meta_vocab fucntion --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 98ea6d9c8d2c7..e84db460f9c20 100755 --- a/convert.py +++ b/convert.py @@ -913,12 +913,12 @@ def add_meta_vocab(self, vocab: Vocab) -> None: scores.append(score) toktypes.append(toktype) - if isinstance(vocab, SentencePieceVocab): + if isinstance(vocab, SentencePieceVocab) or isinstance(vocab, HFVocab): self.gguf.add_tokenizer_model("llama") elif isinstance(vocab, BpeVocab): self.gguf.add_tokenizer_model("gpt2") else: - raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab') + raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab or HFVocab') self.gguf.add_token_list(tokens) self.gguf.add_token_scores(scores) self.gguf.add_token_types(toktypes) From 6ec856b3efab650c74e659db7393efee7c184c5e Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:44:07 +0000 Subject: [PATCH 06/30] remove debug code --- convert.py | 1 - 1 file changed, 1 deletion(-) diff --git a/convert.py b/convert.py index e84db460f9c20..4868b66f9fe88 100755 --- a/convert.py +++ b/convert.py @@ -465,7 +465,6 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} byte_encoder = bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} - print(len(byte_decoder), byte_decoder) for i in range(tokenizer.vocab_size): text = reverse_vocab[i].encode("utf-8") From 1f16e5f234d9466d5443e7cc507247e300de16c0 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:46:48 +0000 Subject: [PATCH 07/30] remove byte_encoder --- convert.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/convert.py b/convert.py index 4868b66f9fe88..955b6546fc496 100755 --- a/convert.py +++ b/convert.py @@ -301,28 +301,6 @@ def load(model_plus: ModelPlus) -> Params: # # vocab # -def bytes_to_unicode(): - # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. 
- """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - return dict(zip(bs, (chr(n) for n in cs))) - - class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) @@ -463,8 +441,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - byte_encoder = bytes_to_unicode() - byte_decoder = {v: k for k, v in byte_encoder.items()} for i in range(tokenizer.vocab_size): text = reverse_vocab[i].encode("utf-8") From e876aec1893313c3c164f2621c2cd5a6a5d78720 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 10:37:41 +0000 Subject: [PATCH 08/30] Add newline between classes --- convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert.py b/convert.py index 955b6546fc496..4077be7f2b117 100755 --- a/convert.py +++ b/convert.py @@ -414,6 +414,7 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def __repr__(self) -> str: return f"" + class HFVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) From 177845089f03ed5bd528ecd307dee6d4bcbbbdeb Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 11:34:25 +0000 Subject: [PATCH 09/30] Check tokenizer.json when tokenizer.model is not exist. --- convert.py | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/convert.py b/convert.py index 4077be7f2b117..8b3e929235c7e 100755 --- a/convert.py +++ b/convert.py @@ -1102,25 +1102,51 @@ def load_some_model(path: Path) -> ModelPlus: return model_plus +def vocab_check_and_append_path(path: Path, vocab_file: str) -> bool: + path2 = path / vocab_file + # Use `.parent` instead of /.. to handle the symlink case better. + path3 = path.parent / vocab_file + + if path2.exists(): + path = path2 + elif path3.exists(): + path = path3 + else: + path = None + + return path + + + def load_vocab(path: Path, vocabtype: str | None) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. - if path.is_dir() and vocabtype != 'hf': + if path.is_dir(): + find_candidates = [] + vocab_file = "tokenizer.model" - if vocabtype == 'bpe': + if vocabtype == "bpe": vocab_file = "vocab.json" - path2 = path / vocab_file - # Use `.parent` instead of /.. to handle the symlink case better. - path3 = path.parent / vocab_file - if path2.exists(): - path = path2 - elif path3.exists(): - path = path3 + + path_candidate = vocab_check_and_append_path(path, vocab_file) + find_candidates.append(vocab_file) + + if path_candidate is None: + vocab_file = "tokenizer.json" + hf_path = vocab_check_and_append_path(path, vocab_file) + find_candidates.append(vocab_file) + + if vocabtype == "spm" and hf_path is not None: + # A case where there is no tokenizer.model but there is a tokenizer.json and it needs to be loaded into HFVocab. 
+ vocabtype = "hf" + else: + raise FileNotFoundError( + f"Could not find {find_candidates} in {path} or its parent; " + "if it's in another directory, pass the directory as --vocab-dir") else: - raise FileNotFoundError( - f"Could not find {vocab_file} in {path} or its parent; " - "if it's in another directory, pass the directory as --vocab-dir") + path = path_candidate + print(f"Loading vocab file '{path}', type '{vocabtype}'") @@ -1209,7 +1235,7 @@ def main(args_in: list[str] | None = None) -> None: assert args.outfile, "need --outfile if using --vocab-only" # FIXME: Try to respect vocab_dir somehow? vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype in ('bpe', 'hf')) + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab)) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") @@ -1221,7 +1247,7 @@ def main(args_in: list[str] | None = None) -> None: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir, args.vocabtype) # FIXME: Try to respect vocab_dir somehow? - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype in ('bpe', 'hf')) + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab)) model = model_plus.model model = convert_model_names(model, params) From a5b26b660bb92f8f2f206179b9e98dab814f9836 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 18 Oct 2023 01:01:50 +0000 Subject: [PATCH 10/30] Move transformers dependency to local code --- convert.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 8b3e929235c7e..c742a1a00ec7b 100755 --- a/convert.py +++ b/convert.py @@ -27,7 +27,6 @@ import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore[import] -from transformers import AutoTokenizer import os if 'NO_LOCAL_GGUF' not in os.environ: @@ -417,6 +416,14 @@ def __repr__(self) -> str: class HFVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: + try: + from transformers import AutoTokenizer + except ModuleNotFoundError: + raise ImportError( + "To use HFVocab, please install the `transformers` package. " + "You can install it with `pip install transformers`." 
+ ) + self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) added_tokens: dict[str, int] @@ -438,7 +445,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens - def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} From 5a1f178091bf0d0be985f91d4a7f520ef156a122 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 18 Oct 2023 01:30:38 +0000 Subject: [PATCH 11/30] Add error context with 'raise from' --- convert.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index c742a1a00ec7b..28de7774c417e 100755 --- a/convert.py +++ b/convert.py @@ -418,11 +418,11 @@ class HFVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: try: from transformers import AutoTokenizer - except ModuleNotFoundError: + except ImportError as e: raise ImportError( "To use HFVocab, please install the `transformers` package. " "You can install it with `pip install transformers`." - ) + ) from e self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) @@ -445,6 +445,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} From 89611cb05a6ff6da1ad0e1fbd346f504e9dc999f Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Mon, 23 Oct 2023 04:15:43 +0000 Subject: [PATCH 12/30] Add fast tokenizer option to BpeVocab --- convert.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/convert.py b/convert.py index 28de7774c417e..b31fb4057e311 100755 --- a/convert.py +++ b/convert.py @@ -302,23 +302,35 @@ def load(model_plus: ModelPlus) -> Params: # class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) + fast_tokenizer = fname_tokenizer.name == 'tokenizer.json' + tokenizer_json = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) + + if fast_tokenizer: + self.bpe_tokenizer = tokenizer_json['model']['vocab'] + else: + self.bpe_tokenizer = tokenizer_json + added_tokens: dict[str, int] if fname_added_tokens is not None: - # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) else: - # Fall back to trying to find the added tokens in tokenizer.json - tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' - if not tokenizer_json_file.is_file(): - added_tokens = {} - else: - tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) - added_tokens = dict( - (item['content'], item['id']) - for item in tokenizer_json.get('added_tokens', []) - # Added tokens here can be duplicates of the main vocabulary. 
- if item['content'] not in self.bpe_tokenizer ) + if not fast_tokenizer: + tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' + + if not tokenizer_json_file.is_file(): + added_tokens = {} + else: + tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) + + added_tokens = dict( + (item['content'], item['id']) + for item in tokenizer_json.get('added_tokens', [])) + + added_tokens = dict( + (token_content, token_id) + for token_content, token_id in added_tokens.items() + # Added tokens here can be duplicates of the main vocabulary. + if token_content not in self.bpe_tokenizer) vocab_size: int = len(self.bpe_tokenizer) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) From e71544231c53e679fb7471129b22f783a8a159c1 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 29 Oct 2023 18:29:38 +0000 Subject: [PATCH 13/30] Update convert.py --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 6a162c4d66379..77a04062d67f8 100755 --- a/convert.py +++ b/convert.py @@ -1159,7 +1159,7 @@ def load_vocab(path: Path, vocabtype: str | None) -> Vocab: hf_path = vocab_check_and_append_path(path, vocab_file) find_candidates.append(vocab_file) - if vocabtype == "spm" and hf_path is not None: + if hf_path is not None: # A case where there is no tokenizer.model but there is a tokenizer.json and it needs to be loaded into HFVocab. vocabtype = "hf" else: From d54764d0b1d2f4a1e3fe6736783afff1bd81a20d Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Mon, 30 Oct 2023 11:54:59 +0000 Subject: [PATCH 14/30] Add VocabLoader and remove *Vocab class --- convert.py | 246 +++++++++++------------------------------------------ 1 file changed, 50 insertions(+), 196 deletions(-) diff --git a/convert.py b/convert.py index 77a04062d67f8..a61853afe894b 100755 --- a/convert.py +++ b/convert.py @@ -297,155 +297,24 @@ def load(model_plus: ModelPlus) -> Params: return params -# -# vocab -# -class BpeVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - fast_tokenizer = fname_tokenizer.name == 'tokenizer.json' - tokenizer_json = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) - - if fast_tokenizer: - self.bpe_tokenizer = tokenizer_json['model']['vocab'] - else: - self.bpe_tokenizer = tokenizer_json - - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - if not fast_tokenizer: - tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' - - if not tokenizer_json_file.is_file(): - added_tokens = {} - else: - tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) - - added_tokens = dict( - (item['content'], item['id']) - for item in tokenizer_json.get('added_tokens', [])) - - added_tokens = dict( - (token_content, token_id) - for token_content, token_id in added_tokens.items() - # Added tokens here can be duplicates of the main vocabulary. 
- if token_content not in self.bpe_tokenizer) - - vocab_size: int = len(self.bpe_tokenizer) - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - expected_end_id = vocab_size + len(actual_ids) - 1 - raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") - - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.bpe_tokenizer - from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} - - for i, _ in enumerate(tokenizer): - yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.CONTROL - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.bpe_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - added_tokens = {} - - vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - - new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} - expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) - - if expected_new_ids != actual_new_ids: - raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") - - # Token pieces that were added to the base vocabulary. - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.sentencepiece_tokenizer - for i in range(tokenizer.vocab_size()): - piece = tokenizer.id_to_piece(i) - text: bytes = piece.encode("utf-8") - score: float = tokenizer.get_score(i) - - toktype = gguf.TokenType.NORMAL - if tokenizer.is_unknown(i): - toktype = gguf.TokenType.UNKNOWN - if tokenizer.is_control(i): - toktype = gguf.TokenType.CONTROL - - # NOTE: I think added_tokens are user defined. 
- # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto - # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED - - if tokenizer.is_unused(i): - toktype = gguf.TokenType.UNUSED - if tokenizer.is_byte(i): - toktype = gguf.TokenType.BYTE - - yield text, score, toktype - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.sentencepiece_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class HFVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: +class VocabLoader: + def __init__(self, fname_tokenizer: Path) -> None: try: from transformers import AutoTokenizer except ImportError as e: raise ImportError( - "To use HFVocab, please install the `transformers` package. " + "To use VocabLoader, please install the `transformers` package. " "You can install it with `pip install transformers`." ) from e self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) + vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - added_tokens = {} + added_tokens = { + token: tid + for token, tid in self.tokenizer.get_added_vocab().items() + if token not in vocab_set + } vocab_size: int = self.tokenizer.vocab_size @@ -459,7 +328,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No self.vocab_size_base: int = vocab_size self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer @@ -478,10 +346,34 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: yield from self.hf_tokens() yield from self.added_tokens() + def get_vocab_type(self) -> str: + path_candidates = [] + vocab_file = "tokenizer.model" + path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + if path_candidate is not None: + return "llama" + + path_candidates.append(path_candidate) + vocab_file = "vocab.json" + path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + if path_candidate is not None: + return "gpt2" + + path_candidates.append(path_candidate) + vocab_file = "tokenizer.json" + path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + if path_candidate: + return "llama" + + path_candidates.append(path_candidate) + raise FileNotFoundError( + f"Could not find {find_candidates} in {path} or its parent; " + "if it's in another directory, pass the directory as --vocab-dir") + def __repr__(self) -> str: - return f"" + return f"" -Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab | HFVocab' +Vocab: TypeAlias = 'VocabLoader' # # data loading @@ -854,17 +746,14 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: - assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) if params.n_vocab == vocab.vocab_size_base: print("Ignoring added_tokens.json 
since model matches vocab size without it.") vocab.added_tokens_list = [] vocab.vocab_size = vocab.vocab_size_base return msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" - if vocab.fname_added_tokens is not None: - msg += f" combined with {vocab.fname_added_tokens}" msg += f" has {vocab.vocab_size})." - if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None: + if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20: msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." raise Exception(msg) @@ -911,12 +800,9 @@ def add_meta_vocab(self, vocab: Vocab) -> None: scores.append(score) toktypes.append(toktype) - if isinstance(vocab, SentencePieceVocab) or isinstance(vocab, HFVocab): - self.gguf.add_tokenizer_model("llama") - elif isinstance(vocab, BpeVocab): - self.gguf.add_tokenizer_model("gpt2") - else: - raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab or HFVocab') + vocab_type = vocab.get_vocab_type() + self.gguf.add_tokenizer_model(vocab_type) + self.gguf.add_token_list(tokens) self.gguf.add_token_scores(scores) self.gguf.add_token_types(toktypes) @@ -1137,50 +1023,16 @@ def vocab_check_and_append_path(path: Path, vocab_file: str) -> bool: path = None return path - - -def load_vocab(path: Path, vocabtype: str | None) -> Vocab: + +def load_vocab(path: Path) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. - if path.is_dir(): - find_candidates = [] - - vocab_file = "tokenizer.model" - if vocabtype == "bpe": - vocab_file = "vocab.json" - - path_candidate = vocab_check_and_append_path(path, vocab_file) - find_candidates.append(vocab_file) - - if path_candidate is None: - vocab_file = "tokenizer.json" - hf_path = vocab_check_and_append_path(path, vocab_file) - find_candidates.append(vocab_file) - - if hf_path is not None: - # A case where there is no tokenizer.model but there is a tokenizer.json and it needs to be loaded into HFVocab. 
- vocabtype = "hf" - else: - raise FileNotFoundError( - f"Could not find {find_candidates} in {path} or its parent; " - "if it's in another directory, pass the directory as --vocab-dir") - else: - path = path_candidate + print(f"Loading vocab file '{path}'") - print(f"Loading vocab file '{path}', type '{vocabtype}'") - - added_tokens_path = path.parent / "added_tokens.json" - if vocabtype == "bpe": - return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None) - elif vocabtype == "spm": - return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) - elif vocabtype == "hf": - return HFVocab(path, added_tokens_path if added_tokens_path.exists() else None) - else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") + return VocabLoader(path) def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: @@ -1215,7 +1067,6 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--vocabtype", choices=["spm", "bpe", "hf"], help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") @@ -1261,9 +1112,9 @@ def main(args_in: list[str] | None = None) -> None: if not args.outfile: raise ValueError("need --outfile if using --vocab-only") # FIXME: Try to respect vocab_dir somehow? - vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) + vocab = load_vocab(args.vocab_dir or args.model) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab), + load_merges = True, n_vocab = vocab.vocab_size) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) @@ -1274,12 +1125,15 @@ def main(args_in: list[str] | None = None) -> None: vocab = model_plus.vocab else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(vocab_dir, args.vocabtype) + vocab = load_vocab(vocab_dir) + # FIXME: Try to respect vocab_dir somehow? 
+ print(f"Vocab info: {vocab}") special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab), + load_merges = True, n_vocab = vocab.vocab_size) + print(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params) ftype = pick_output_type(model, args.outtype) From e19b78038a8fdef128006433118ce374f2e97b43 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Mon, 30 Oct 2023 11:56:49 +0000 Subject: [PATCH 15/30] Add transformers dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 81c909d0ba7fe..badfec3be804c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy==1.24.4 sentencepiece==0.1.98 +transformers>=4.34.0 gguf>=0.1.0 From 28f09beb60e56c5e1eea68223cd809da57d3c02d Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 5 Nov 2023 20:22:25 +0900 Subject: [PATCH 16/30] remove added tokens and check newline token to decide spm or bpe --- convert.py | 46 +++++++++++++++------------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/convert.py b/convert.py index a61853afe894b..d4d2a5a9b74c9 100755 --- a/convert.py +++ b/convert.py @@ -309,42 +309,23 @@ def __init__(self, fname_tokenizer: Path) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} - - added_tokens = { - token: tid - for token, tid in self.tokenizer.get_added_vocab().items() - if token not in vocab_set - } - vocab_size: int = self.tokenizer.vocab_size - - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") - - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size: int = len(self.tokenizer.vocab) self.fname_tokenizer = fname_tokenizer def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - for i in range(tokenizer.vocab_size): + for i in range(self.vocab_size): text = reverse_vocab[i].encode("utf-8") yield text, 0.0, gguf.TokenType.NORMAL - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + + def has_newline_token(self): + return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: yield from self.hf_tokens() - yield from self.added_tokens() def get_vocab_type(self) -> str: path_candidates = [] @@ -352,26 +333,29 @@ def get_vocab_type(self) -> str: path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "llama" - + path_candidates.append(path_candidate) vocab_file = "vocab.json" path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "gpt2" - + path_candidates.append(path_candidate) vocab_file = "tokenizer.json" 
path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate: - return "llama" - + if not self.tokenizer.can_save_slow_tokenizer(): + return "gpt2" + else: + return "llama" + path_candidates.append(path_candidate) raise FileNotFoundError( f"Could not find {find_candidates} in {path} or its parent; " "if it's in another directory, pass the directory as --vocab-dir") def __repr__(self) -> str: - return f"" + return f"" Vocab: TypeAlias = 'VocabLoader' @@ -746,10 +730,10 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: - if params.n_vocab == vocab.vocab_size_base: + if params.n_vocab == vocab.vocab_size: print("Ignoring added_tokens.json since model matches vocab size without it.") vocab.added_tokens_list = [] - vocab.vocab_size = vocab.vocab_size_base + vocab.vocab_size = vocab.vocab_size return msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" msg += f" has {vocab.vocab_size})." From 4adb8b986217b789423a1c11b2d4ce3cb131fd06 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 5 Nov 2023 22:41:40 +0900 Subject: [PATCH 17/30] Update convert.py --- convert.py | 53 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/convert.py b/convert.py index d4d2a5a9b74c9..7e43bfb01a779 100755 --- a/convert.py +++ b/convert.py @@ -310,6 +310,8 @@ def __init__(self, fname_tokenizer: Path) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} + self.added_tokens_list = [] + self.vocab_size_base: int = len(self.tokenizer.vocab) self.vocab_size: int = len(self.tokenizer.vocab) self.fname_tokenizer = fname_tokenizer @@ -317,15 +319,21 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - for i in range(self.vocab_size): + for i in range(self.vocab_size_base): text = reverse_vocab[i].encode("utf-8") yield text, 0.0, gguf.TokenType.NORMAL - + + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + for text in self.added_tokens_list: + score = -1000.0 + yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + def has_newline_token(self): return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: yield from self.hf_tokens() + yield from self.added_tokens() def get_vocab_type(self) -> str: path_candidates = [] @@ -344,7 +352,7 @@ def get_vocab_type(self) -> str: vocab_file = "tokenizer.json" path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate: - if not self.tokenizer.can_save_slow_tokenizer(): + if not self.has_newline_token(): return "gpt2" else: return "llama" @@ -355,7 +363,7 @@ def get_vocab_type(self) -> str: "if it's in another directory, pass the directory as --vocab-dir") def __repr__(self) -> str: - return f"" + return f"" Vocab: TypeAlias = 'VocabLoader' @@ -728,17 +736,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc break yield result -def check_vocab_size(params: Params, vocab: Vocab) -> None: +def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: if params.n_vocab != vocab.vocab_size: if 
params.n_vocab == vocab.vocab_size: print("Ignoring added_tokens.json since model matches vocab size without it.") vocab.added_tokens_list = [] vocab.vocab_size = vocab.vocab_size return + + if pad_vocab and params.n_vocab > vocab.vocab_size: + pad_count = params.n_vocab - vocab.vocab_size + print(f'Padding vocab with {pad_count} token(s) - through ') + for i in range(1, (params.n_vocab - vocab.vocab_size) + 1): + vocab.added_tokens_list.append(f'') + vocab.vocab_size = params.n_vocab + return msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" msg += f" has {vocab.vocab_size})." if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20: msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." + if vocab.vocab_size < params.n_vocab: + msg += " Possibly try using the --padvocab option." raise Exception(msg) @@ -812,8 +830,12 @@ def close(self) -> None: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: - check_vocab_size(params, vocab) + def write_vocab_only( + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + ) -> None: + check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -840,8 +862,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: return dt.quantize(arr) @staticmethod - def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None: - check_vocab_size(params, vocab) + def write_all( + fname_out : Path, ftype: GGMLFileType, params: Params, + model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, + endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab : bool = False, + ) -> None: + check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1054,6 +1082,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") args = parser.parse_args(args_in) if args.dump_single: @@ -1101,7 +1130,8 @@ def main(args_in: list[str] | None = None) -> None: load_merges = True, n_vocab = vocab.vocab_size) outfile = args.outfile - OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) + OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, + endianess = endianess, pad_vocab = args.padvocab) print(f"Wrote {outfile}") return @@ -1127,7 +1157,8 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess) + OutputFile.write_all(outfile, ftype, params, model, vocab, 
special_vocab, + concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab) print(f"Wrote {outfile}") From 13f07013ee71cf533b712fbc0fd167428d2a4e8e Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 5 Nov 2023 23:18:26 +0900 Subject: [PATCH 18/30] Add special token type --- convert.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/convert.py b/convert.py index 7e43bfb01a779..c4eb28391bcdc 100755 --- a/convert.py +++ b/convert.py @@ -311,17 +311,18 @@ def __init__(self, fname_tokenizer: Path) -> None: vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} self.added_tokens_list = [] - self.vocab_size_base: int = len(self.tokenizer.vocab) - self.vocab_size: int = len(self.tokenizer.vocab) + self.vocab_size_base: int = len(vocab_set) + self.vocab_size: int = len(vocab_set) self.fname_tokenizer = fname_tokenizer def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + special_ids = set(tokenizer.all_special_ids) for i in range(self.vocab_size_base): text = reverse_vocab[i].encode("utf-8") - yield text, 0.0, gguf.TokenType.NORMAL + yield text, 0.0, gguf.TokenType.NORMAL if i not in special_ids else gguf.TokenType.CONTROL def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: From f37a7d7028d0df1f5634984c95cd634402326244 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 12 Nov 2023 02:22:37 +0900 Subject: [PATCH 19/30] Update convert.py --- convert.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/convert.py b/convert.py index c4eb28391bcdc..f72ed9ab5a0c9 100755 --- a/convert.py +++ b/convert.py @@ -311,9 +311,18 @@ def __init__(self, fname_tokenizer: Path) -> None: vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} self.added_tokens_list = [] + self.unk_token_id = self.tokenizer.unk_token_id + self.special_ids = set(self.tokenizer.all_special_ids) self.vocab_size_base: int = len(vocab_set) self.vocab_size: int = len(vocab_set) self.fname_tokenizer = fname_tokenizer + + vocab_file = "tokenizer.model" + path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + if path_candidate is not None: + self.spm = SentencePieceProcessor(str(path_candidate)) + else: + self.spm def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer @@ -322,7 +331,32 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for i in range(self.vocab_size_base): text = reverse_vocab[i].encode("utf-8") - yield text, 0.0, gguf.TokenType.NORMAL if i not in special_ids else gguf.TokenType.CONTROL + yield text, self.get_token_score(i), self.get_token_type(i) + + def get_token_type(self, token_id): + toktype = gguf.TokenType.NORMAL + + if self.spm is None: + if i == self.unk_token_id: + toktype = gguf.TokenType.UNKNOWN + if i in self.special_ids: + toktype = gguf.TokenType.CONTROL + else: + if self.spm.is_unknown(token_id): + toktype = gguf.TokenType.UNKNOWN + if self.spm.is_control(token_id): + toktype = gguf.TokenType.CONTROL + if self.spm.is_unused(token_id): + toktype = gguf.TokenType.UNUSED + if self.spm.is_byte(token_id): + toktype = gguf.TokenType.BYTE + return toktype + + def get_token_score(self, token_id): + if self.spm is not None: + return self.spm.get_score(token_id) + else: + return 0.0 def added_tokens(self) 
-> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: From 9f4dc236a90f8af05daa20c62889805a9939d311 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 12 Nov 2023 03:23:41 +0900 Subject: [PATCH 20/30] Update convert.py --- convert.py | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/convert.py b/convert.py index f72ed9ab5a0c9..c79318887a86e 100755 --- a/convert.py +++ b/convert.py @@ -310,8 +310,16 @@ def __init__(self, fname_tokenizer: Path) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} - self.added_tokens_list = [] + self.added_tokens_list = [tok for tok in self.tokenizer.get_added_vocab()] + self.added_tokens_dict = dict(self.tokenizer.get_added_vocab()) + self.added_tokens_ids = set(self.tokenizer.get_added_vocab().values()) + self.unk_token_id = self.tokenizer.unk_token_id + self.specials = { + tok: self.tokenizer.vocab[tok] + for tok in self.tokenizer.all_special_tokens + } + print(self.specials) self.special_ids = set(self.tokenizer.all_special_ids) self.vocab_size_base: int = len(vocab_set) self.vocab_size: int = len(vocab_set) @@ -321,6 +329,7 @@ def __init__(self, fname_tokenizer: Path) -> None: path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: self.spm = SentencePieceProcessor(str(path_candidate)) + print(self.spm.vocab_size(), self.vocab_size_base) else: self.spm @@ -330,18 +339,16 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: special_ids = set(tokenizer.all_special_ids) for i in range(self.vocab_size_base): + if i in self.added_tokens_ids: + continue + text = reverse_vocab[i].encode("utf-8") yield text, self.get_token_score(i), self.get_token_type(i) - def get_token_type(self, token_id): - toktype = gguf.TokenType.NORMAL + def get_token_type(self, token_id, default_type=gguf.TokenType.NORMAL): + toktype = default_type - if self.spm is None: - if i == self.unk_token_id: - toktype = gguf.TokenType.UNKNOWN - if i in self.special_ids: - toktype = gguf.TokenType.CONTROL - else: + if self.spm is not None and token_id < self.spm.vocab_size(): if self.spm.is_unknown(token_id): toktype = gguf.TokenType.UNKNOWN if self.spm.is_control(token_id): @@ -350,18 +357,35 @@ def get_token_type(self, token_id): toktype = gguf.TokenType.UNUSED if self.spm.is_byte(token_id): toktype = gguf.TokenType.BYTE + else: + if token_id == self.unk_token_id: + toktype = gguf.TokenType.UNKNOWN + if token_id in self.special_ids: + toktype = gguf.TokenType.CONTROL + return toktype def get_token_score(self, token_id): - if self.spm is not None: + if self.spm is not None and token_id < self.spm.vocab_size(): return self.spm.get_score(token_id) else: return 0.0 def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + default_toktype = gguf.TokenType.USER_DEFINED + for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + + if text in self.specials: + + toktype = self.get_token_type(self.specials[text], default_toktype) + score = self.get_token_score(self.specials[text]) + + else: + toktype = default_toktype + score = -1000.0 + + yield text.encode("utf-8"), score, toktype def has_newline_token(self): return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab From dcf372e60ecd520f947e80eb95f8873c648b1613 Mon Sep 17 00:00:00 2001 
From: wonjun Jang Date: Sun, 12 Nov 2023 03:26:46 +0900 Subject: [PATCH 21/30] Update convert.py --- convert.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/convert.py b/convert.py index c79318887a86e..45123adb297d7 100755 --- a/convert.py +++ b/convert.py @@ -345,8 +345,8 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: text = reverse_vocab[i].encode("utf-8") yield text, self.get_token_score(i), self.get_token_type(i) - def get_token_type(self, token_id, default_type=gguf.TokenType.NORMAL): - toktype = default_type + def get_token_type(self, token_id): + toktype = gguf.TokenType.NORMAL if self.spm is not None and token_id < self.spm.vocab_size(): if self.spm.is_unknown(token_id): @@ -372,17 +372,16 @@ def get_token_score(self, token_id): return 0.0 def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - default_toktype = gguf.TokenType.USER_DEFINED for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type(self.specials[text], default_toktype) + toktype = self.get_token_type(self.specials[text]) score = self.get_token_score(self.specials[text]) else: - toktype = default_toktype + toktype = gguf.TokenType.USER_DEFINED score = -1000.0 yield text.encode("utf-8"), score, toktype From cc1f3fcfadd5321a2e2e8e4f5cdf0f1ba49d2468 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 15 Nov 2023 17:22:59 +0900 Subject: [PATCH 22/30] Fix typo in convert.py --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 45123adb297d7..ece023dc69a30 100755 --- a/convert.py +++ b/convert.py @@ -331,7 +331,7 @@ def __init__(self, fname_tokenizer: Path) -> None: self.spm = SentencePieceProcessor(str(path_candidate)) print(self.spm.vocab_size(), self.vocab_size_base) else: - self.spm + self.spm = None def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer From 026eb7cd01e6dd478357c60608511b0fd55f5301 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sat, 18 Nov 2023 12:55:14 +0900 Subject: [PATCH 23/30] Fix when params.n_vocab < tokenizer vocab size --- convert.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/convert.py b/convert.py index ece023dc69a30..ae47e198d3b65 100755 --- a/convert.py +++ b/convert.py @@ -298,7 +298,7 @@ def load(model_plus: ModelPlus) -> Params: class VocabLoader: - def __init__(self, fname_tokenizer: Path) -> None: + def __init__(self, params: Params, fname_tokenizer: Path) -> None: try: from transformers import AutoTokenizer except ImportError as e: @@ -309,10 +309,18 @@ def __init__(self, fname_tokenizer: Path) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} - - self.added_tokens_list = [tok for tok in self.tokenizer.get_added_vocab()] - self.added_tokens_dict = dict(self.tokenizer.get_added_vocab()) - self.added_tokens_ids = set(self.tokenizer.get_added_vocab().values()) + + self.added_tokens_list = [] + self.added_tokens_dict = dict() + self.added_tokens_ids = set() + + for tok, tokidx in self.tokenizer.get_added_vocab().items(): + if tokidx >= params.n_vocab or toksize < self.tokenizer.vocab_size: + continue + + self.added_tokens_list.append(tok) + self.added_tokens_dict[tok] = tokidx + self.added_tokens_ids.add(tokidx) self.unk_token_id = self.tokenizer.unk_token_id self.specials = { @@ -321,8 +329,8 @@ def __init__(self, 
fname_tokenizer: Path) -> None: } print(self.specials) self.special_ids = set(self.tokenizer.all_special_ids) - self.vocab_size_base: int = len(vocab_set) - self.vocab_size: int = len(vocab_set) + self.vocab_size_base: int = self.tokenizer.vocab_size + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer vocab_file = "tokenizer.model" @@ -374,7 +382,6 @@ def get_token_score(self, token_id): def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: - if text in self.specials: toktype = self.get_token_type(self.specials[text]) @@ -1095,14 +1102,14 @@ def vocab_check_and_append_path(path: Path, vocab_file: str) -> bool: return path -def load_vocab(path: Path) -> Vocab: +def load_vocab(params: Params, path: Path) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. print(f"Loading vocab file '{path}'") - return VocabLoader(path) + return VocabLoader(params, path) def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: @@ -1183,7 +1190,7 @@ def main(args_in: list[str] | None = None) -> None: if not args.outfile: raise ValueError("need --outfile if using --vocab-only") # FIXME: Try to respect vocab_dir somehow? - vocab = load_vocab(args.vocab_dir or args.model) + vocab = load_vocab(params, args.vocab_dir or args.model) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = True, n_vocab = vocab.vocab_size) @@ -1197,7 +1204,7 @@ def main(args_in: list[str] | None = None) -> None: vocab = model_plus.vocab else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(vocab_dir) + vocab = load_vocab(params, vocab_dir) # FIXME: Try to respect vocab_dir somehow? print(f"Vocab info: {vocab}") From 2e263ca2003529943c97699bd53c6805f4435736 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 19 Nov 2023 10:20:06 +0900 Subject: [PATCH 24/30] update vocab class --- convert.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/convert.py b/convert.py index ae47e198d3b65..6174e629bb587 100755 --- a/convert.py +++ b/convert.py @@ -307,15 +307,19 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: "You can install it with `pip install transformers`." 
) from e - self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) - vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} + try: + self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True) + vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} + except: + self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True) + vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} self.added_tokens_list = [] self.added_tokens_dict = dict() self.added_tokens_ids = set() - for tok, tokidx in self.tokenizer.get_added_vocab().items(): - if tokidx >= params.n_vocab or toksize < self.tokenizer.vocab_size: + for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]): + if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size: continue self.added_tokens_list.append(tok) @@ -324,7 +328,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: self.unk_token_id = self.tokenizer.unk_token_id self.specials = { - tok: self.tokenizer.vocab[tok] + tok: self.tokenizer.get_vocab()[tok] for tok in self.tokenizer.all_special_tokens } print(self.specials) @@ -343,7 +347,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()} special_ids = set(tokenizer.all_special_ids) for i in range(self.vocab_size_base): From 5ac1949fff740425d32a91066551208c8fde0d05 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 22 Nov 2023 19:54:04 +0900 Subject: [PATCH 25/30] change funtion name --- convert.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/convert.py b/convert.py index 6174e629bb587..20e2be5c2d639 100755 --- a/convert.py +++ b/convert.py @@ -338,7 +338,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: self.fname_tokenizer = fname_tokenizer vocab_file = "tokenizer.model" - path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: self.spm = SentencePieceProcessor(str(path_candidate)) print(self.spm.vocab_size(), self.vocab_size_base) @@ -407,19 +407,19 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def get_vocab_type(self) -> str: path_candidates = [] vocab_file = "tokenizer.model" - path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "llama" path_candidates.append(path_candidate) vocab_file = "vocab.json" - path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "gpt2" path_candidates.append(path_candidate) vocab_file = "tokenizer.json" - path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate: if not self.has_newline_token(): return "gpt2" @@ -1091,7 +1091,7 @@ def load_some_model(path: Path) -> ModelPlus: return model_plus -def 
vocab_check_and_append_path(path: Path, vocab_file: str) -> bool: +def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]: path2 = path / vocab_file # Use `.parent` instead of /.. to handle the symlink case better. path3 = path.parent / vocab_file From 61edd1bc5999480e71f1e3121ea54e33288ce519 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Tue, 28 Nov 2023 16:23:27 +0900 Subject: [PATCH 26/30] Remove unused variable/functions, add types to class variable and methods, delete blank liens --- convert.py | 80 ++++++++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/convert.py b/convert.py index 36f38472dae8b..65ba4334507d8 100644 --- a/convert.py +++ b/convert.py @@ -18,6 +18,7 @@ import time import zipfile from abc import ABCMeta, abstractmethod +from collections import OrderedDict from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path @@ -313,30 +314,25 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: except: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} - - self.added_tokens_list = [] - self.added_tokens_dict = dict() - self.added_tokens_ids = set() - + + self.added_tokens_dict: OrderedDict[str, int] = OrderedDict() + for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]): if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size: continue - - self.added_tokens_list.append(tok) + self.added_tokens_dict[tok] = tokidx - self.added_tokens_ids.add(tokidx) - - self.unk_token_id = self.tokenizer.unk_token_id - self.specials = { + + self.unk_token_id: int = self.tokenizer.unk_token_id + self.specials: dict[str, int] = { tok: self.tokenizer.get_vocab()[tok] for tok in self.tokenizer.all_special_tokens } - print(self.specials) - self.special_ids = set(self.tokenizer.all_special_ids) + self.special_ids: set[int] = set(self.tokenizer.all_special_ids) self.vocab_size_base: int = self.tokenizer.vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict) + self.fname_tokenizer: str = fname_tokenizer + vocab_file = "tokenizer.model" path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: @@ -348,18 +344,18 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()} - special_ids = set(tokenizer.all_special_ids) - + added_tokens_ids = set(self.added_tokens_dict.values()) + for i in range(self.vocab_size_base): - if i in self.added_tokens_ids: + if i in added_tokens_ids: continue - + text = reverse_vocab[i].encode("utf-8") yield text, self.get_token_score(i), self.get_token_type(i) - def get_token_type(self, token_id): + def get_token_type(self, token_id: int) -> gguf.TokenType: toktype = gguf.TokenType.NORMAL - + if self.spm is not None and token_id < self.spm.vocab_size(): if self.spm.is_unknown(token_id): toktype = gguf.TokenType.UNKNOWN @@ -377,27 +373,27 @@ def get_token_type(self, token_id): return toktype - def get_token_score(self, token_id): + def 
get_token_score(self, token_id: int) -> float: if self.spm is not None and token_id < self.spm.vocab_size(): return self.spm.get_score(token_id) else: return 0.0 def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - - for text in self.added_tokens_list: + + for text in self.added_tokens_dict: if text in self.specials: - + toktype = self.get_token_type(self.specials[text]) score = self.get_token_score(self.specials[text]) - + else: toktype = gguf.TokenType.USER_DEFINED score = -1000.0 yield text.encode("utf-8"), score, toktype - def has_newline_token(self): + def has_newline_token(self) -> bool: return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -421,7 +417,7 @@ def get_vocab_type(self) -> str: vocab_file = "tokenizer.json" path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate: - if not self.has_newline_token(): + if not self.has_newline_token(): return "gpt2" else: return "llama" @@ -432,7 +428,7 @@ def get_vocab_type(self) -> str: "if it's in another directory, pass the directory as --vocab-dir") def __repr__(self) -> str: - return f"" + return f"" Vocab: TypeAlias = 'VocabLoader' @@ -814,15 +810,15 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N if params.n_vocab != vocab.vocab_size: if params.n_vocab == vocab.vocab_size: print("Ignoring added_tokens.json since model matches vocab size without it.") - vocab.added_tokens_list = [] + vocab.added_tokens_dict = OrderedDict() vocab.vocab_size = vocab.vocab_size return - + if pad_vocab and params.n_vocab > vocab.vocab_size: pad_count = params.n_vocab - vocab.vocab_size print(f'Padding vocab with {pad_count} token(s) - through ') for i in range(1, (params.n_vocab - vocab.vocab_size) + 1): - vocab.added_tokens_list.append(f'') + vocab.added_tokens_dict[f''] = -1 vocab.vocab_size = params.n_vocab return msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" @@ -1112,25 +1108,15 @@ def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]: path2 = path / vocab_file # Use `.parent` instead of /.. to handle the symlink case better. path3 = path.parent / vocab_file - + if path2.exists(): path = path2 elif path3.exists(): path = path3 else: path = None - - return path - -def load_vocab(params: Params, path: Path) -> Vocab: - # Be extra-friendly and accept either a file or a directory. Also, if it's - # a directory, it might be the model directory, and tokenizer.model might - # be in the parent of that. - - print(f"Loading vocab file '{path}'") - - return VocabLoader(params, path) + return path def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: @@ -1215,7 +1201,7 @@ def main(args_in: list[str] | None = None) -> None: if not args.outfile: raise ValueError("need --outfile if using --vocab-only") # FIXME: Try to respect vocab_dir somehow? - vocab = load_vocab(params, args.vocab_dir or args.model) + vocab = VocabLoader(params, args.vocab_dir or args.model) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = True, n_vocab = vocab.vocab_size) @@ -1229,7 +1215,7 @@ def main(args_in: list[str] | None = None) -> None: vocab = model_plus.vocab else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(params, vocab_dir) + vocab = VocabLoader(params, vocab_dir) # FIXME: Try to respect vocab_dir somehow? 
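The --padvocab branch in the check_vocab_size() hunk above fills the gap between the tokenizer's vocab and params.n_vocab with placeholder tokens. A minimal sketch of that idea, assuming a `<dummy00001>`-style placeholder text (illustrative, not taken verbatim from the patch):

from __future__ import annotations
from collections import OrderedDict

def pad_vocab_to(added_tokens: OrderedDict[str, int], current_size: int, n_vocab: int) -> int:
    # Grow the vocab with placeholder entries until it matches the model's n_vocab.
    pad_count = n_vocab - current_size
    print(f"Padding vocab with {pad_count} token(s)")
    for i in range(1, pad_count + 1):
        added_tokens[f"<dummy{i:05}>"] = -1  # -1 mirrors the placeholder id used in the hunk above
    return n_vocab  # the caller records this as the new vocab_size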
print(f"Vocab info: {vocab}") From 1f5357cbcf3be28a66883c42f9c3521597c070fa Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Tue, 28 Nov 2023 16:46:54 +0900 Subject: [PATCH 27/30] fix flake8 warnings --- convert.py | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/convert.py b/convert.py index 65ba4334507d8..4f38ece0f4c88 100644 --- a/convert.py +++ b/convert.py @@ -22,7 +22,7 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar +from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar, Optional import numpy as np from sentencepiece import SentencePieceProcessor @@ -310,10 +310,8 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: try: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True) - vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} - except: + except Exception: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True) - vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} self.added_tokens_dict: OrderedDict[str, int] = OrderedDict() @@ -423,15 +421,15 @@ def get_vocab_type(self) -> str: return "llama" path_candidates.append(path_candidate) - raise FileNotFoundError( - f"Could not find {find_candidates} in {path} or its parent; " - "if it's in another directory, pass the directory as --vocab-dir") + raise FileNotFoundError(f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; if it's in another directory, pass the directory as --vocab-dir") def __repr__(self) -> str: return f"" + Vocab: TypeAlias = 'VocabLoader' + # # data loading # TODO: reuse (probably move to gguf.py?) 
@@ -806,6 +804,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc break yield result + def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: if params.n_vocab != vocab.vocab_size: if params.n_vocab == vocab.vocab_size: @@ -907,11 +906,10 @@ def close(self) -> None: self.gguf.close() @staticmethod - def write_vocab_only( - fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, - ) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, + svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False) -> None: check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -939,13 +937,11 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: return dt.quantize(arr) @staticmethod - def write_all( - fname_out : Path, ftype: GGMLFileType, params: Params, - model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, - endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab : bool = False, - ) -> None: + def write_all(fname_out : Path, ftype: GGMLFileType, params: Params, + model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, + endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab : bool = False) -> None: check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1207,7 +1203,7 @@ def main(args_in: list[str] | None = None) -> None: n_vocab = vocab.vocab_size) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, - endianess = endianess, pad_vocab = args.padvocab) + endianess = endianess, pad_vocab = args.padvocab) print(f"Wrote {outfile}") return @@ -1234,7 +1230,7 @@ def main(args_in: list[str] | None = None) -> None: print(f"Writing {outfile}, format {ftype}") OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, - concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab) + concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab) print(f"Wrote {outfile}") From 8fabb0132cf53fe9c4fe18bbbe3c1aa8d09b793c Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 13:03:24 -0500 Subject: [PATCH 28/30] code style cleanup --- convert.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/convert.py b/convert.py index 4f38ece0f4c88..6eab0ee34602a 100644 --- a/convert.py +++ b/convert.py @@ -10,6 +10,7 @@ import json import math import mmap +import os import pickle import re import signal @@ -22,12 +23,11 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar, Optional +from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast import numpy as np from sentencepiece import SentencePieceProcessor -import os if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -417,11 +417,13 @@ def get_vocab_type(self) -> str: if path_candidate: if not self.has_newline_token(): return "gpt2" - else: - return "llama" + return "llama" 
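Taken out of the diff, the vocab-type detection that get_vocab_type() above settles on reads roughly as below; `detect_vocab_type` is a simplified restatement (not a drop-in replacement), the helper mirrors the find_vocab_file_path() renamed in PATCH 25, and the boolean argument stands in for has_newline_token().

from pathlib import Path
from typing import Optional

def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
    # Look next to the model and one directory up (the symlink case handled above).
    for candidate in (path / vocab_file, path.parent / vocab_file):
        if candidate.exists():
            return candidate
    return None

def detect_vocab_type(path: Path, has_newline_token: bool) -> str:
    if find_vocab_file_path(path, "tokenizer.model"):
        return "llama"  # SentencePiece model present
    if find_vocab_file_path(path, "vocab.json"):
        return "gpt2"   # plain BPE vocab
    if find_vocab_file_path(path, "tokenizer.json"):
        return "llama" if has_newline_token else "gpt2"
    raise FileNotFoundError("no tokenizer files found; pass the directory with --vocab-dir")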
path_candidates.append(path_candidate) - raise FileNotFoundError(f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; if it's in another directory, pass the directory as --vocab-dir") + raise FileNotFoundError( + f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; " + "if it's in another directory, pass the directory as --vocab-dir" + ) def __repr__(self) -> str: return f"" @@ -906,10 +908,11 @@ def close(self) -> None: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, - svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False) -> None: + def write_vocab_only( + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + ) -> None: check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -937,11 +940,12 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: return dt.quantize(arr) @staticmethod - def write_all(fname_out : Path, ftype: GGMLFileType, params: Params, - model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, - endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab : bool = False) -> None: + def write_all( + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + ) -> None: check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) From c3b1c12fdd454cf9a78eb7cb7dc83508319b56d7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 13:03:57 -0500 Subject: [PATCH 29/30] make mypy happy --- convert.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/convert.py b/convert.py index 6eab0ee34602a..49fa3fc168087 100644 --- a/convert.py +++ b/convert.py @@ -329,7 +329,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: self.special_ids: set[int] = set(self.tokenizer.all_special_ids) self.vocab_size_base: int = self.tokenizer.vocab_size self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict) - self.fname_tokenizer: str = fname_tokenizer + self.fname_tokenizer: Path = fname_tokenizer vocab_file = "tokenizer.model" path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) @@ -373,9 +373,8 @@ def get_token_type(self, token_id: int) -> gguf.TokenType: def get_token_score(self, token_id: int) -> float: if self.spm is not None and token_id < self.spm.vocab_size(): - return self.spm.get_score(token_id) - else: - return 0.0 + return cast(float, self.spm.get_score(token_id)) + return 0.0 def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -1110,13 +1109,11 @@ def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]: path3 = path.parent / vocab_file if path2.exists(): - path = path2 - elif path3.exists(): - path = path3 - else: - path = None + return path2 + if path3.exists(): + return path3 - return path + return None def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: From 35e95b6266a0d0fb4fccfb63db3a8eeb1833d283 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Thu, 14 Dec 2023 08:33:10 +0900 Subject: [PATCH 30/30] change exception --- 
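Looking back at the added-token emission reshaped in PATCH 21 and kept through the later cleanups: added tokens that are registered specials take their type and score from the SentencePiece model when one is loaded, and everything else is written as USER_DEFINED with a sentinel score of -1000.0. A compressed, standalone sketch under those assumptions; it collapses the full UNKNOWN/UNUSED/BYTE handling into a CONTROL/NORMAL distinction and is not the converter's exact logic.

from __future__ import annotations
from typing import Optional
from sentencepiece import SentencePieceProcessor  # type: ignore[import]
import gguf  # assumes the gguf-py package is importable, as convert.py arranges

def classify_added_token(text: str, specials: dict[str, int],
                         spm: Optional[SentencePieceProcessor]) -> tuple[float, gguf.TokenType]:
    token_id = specials.get(text)
    if token_id is not None and spm is not None and token_id < spm.vocab_size():
        # Special tokens backed by the SentencePiece model reuse its score and type.
        toktype = gguf.TokenType.CONTROL if spm.is_control(token_id) else gguf.TokenType.NORMAL
        return spm.get_score(token_id), toktype
    # Everything else is user-defined with a sentinel score.
    return -1000.0, gguf.TokenType.USER_DEFINED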
convert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index 49fa3fc168087..9f7ab060a919d 100644 --- a/convert.py +++ b/convert.py @@ -310,7 +310,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: try: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True) - except Exception: + except ValueError: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True) self.added_tokens_dict: OrderedDict[str, int] = OrderedDict() @@ -400,25 +400,25 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def get_vocab_type(self) -> str: path_candidates = [] vocab_file = "tokenizer.model" + path_candidates.append(vocab_file) path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "llama" - path_candidates.append(path_candidate) vocab_file = "vocab.json" + path_candidates.append(vocab_file) path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "gpt2" - path_candidates.append(path_candidate) vocab_file = "tokenizer.json" + path_candidates.append(vocab_file) path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate: if not self.has_newline_token(): return "gpt2" return "llama" - path_candidates.append(path_candidate) raise FileNotFoundError( f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; " "if it's in another directory, pass the directory as --vocab-dir"
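PATCH 30 above stops appending the (possibly None) path_candidate and records the searched filenames instead, so the FileNotFoundError can list what was actually looked for. A self-contained sketch of that pattern; `locate_vocab_file` is an illustrative name, not a convert.py function.

from __future__ import annotations
from pathlib import Path

def locate_vocab_file(path: Path, names: list[str]) -> Path:
    # Record every filename we try so the error message can list them all.
    searched: list[str] = []
    for name in names:
        searched.append(name)
        for candidate in (path / name, path.parent / name):
            if candidate.exists():
                return candidate
    raise FileNotFoundError(
        f"Could not find {searched} in {path} or its parent; "
        "if it's in another directory, pass the directory as --vocab-dir")

Called with ["tokenizer.model", "vocab.json", "tokenizer.json"], this reproduces the search order used by get_vocab_type().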