From f7e377d62f20d92a05b4c95647efeff4c2976c18 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:32:22 +0000 Subject: [PATCH 01/30] Add HFVocab into convert.py --- convert.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index e9b08d344f5bd..d6e6558c25c08 100755 --- a/convert.py +++ b/convert.py @@ -414,7 +414,54 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def __repr__(self) -> str: return f"" -Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' +class HFVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) + + added_tokens: dict[str, int] + if fname_added_tokens is not None: + added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) + else: + added_tokens = {} + + vocab_size: int = self.tokenizer.vocab_size + + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) + if expected_ids != actual_ids: + raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") + + items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base: int = vocab_size + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + self.fname_added_tokens = fname_added_tokens + + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + tokenizer = self.tokenizer + reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + byte_encoder = bytes_to_unicode() + byte_decoder = {v: k for k, v in byte_encoder.items()} + print(len(byte_decoder), byte_decoder) + + for i in range(tokenizer.vocab_size): + text = reverse_vocab[i].encode("utf-8") + yield text, 0.0, gguf.TokenType.NORMAL + + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + for text in self.added_tokens_list: + score = -1000.0 + yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + yield from self.hf_tokens() + yield from self.added_tokens() + + def __repr__(self) -> str: + return f"" + +Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab | HFVocab' # # data loading @@ -1084,6 +1131,8 @@ def load_vocab(path: Path, vocabtype: str | None) -> Vocab: return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None) elif vocabtype == "spm": return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) + elif vocabtype == "hf": + return HFVocab(path, added_tokens_path if added_tokens_path.exists() else None) else: raise ValueError(f"Unsupported vocabulary type {vocabtype}") @@ -1120,7 +1169,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") + parser.add_argument("--vocabtype", choices=["spm", "bpe", "hf"], 
help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) args = parser.parse_args(args_in) @@ -1162,7 +1211,7 @@ def main(args_in: list[str] | None = None) -> None: assert args.outfile, "need --outfile if using --vocab-only" # FIXME: Try to respect vocab_dir somehow? vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype in ('bpe', 'hf')) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") @@ -1174,7 +1223,7 @@ def main(args_in: list[str] | None = None) -> None: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir, args.vocabtype) # FIXME: Try to respect vocab_dir somehow? - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype in ('bpe', 'hf')) model = model_plus.model model = convert_model_names(model, params) From b0e00cb868b414cd135fc8a636343b2a583bc87e Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:38:58 +0000 Subject: [PATCH 02/30] Update convert.py --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index d6e6558c25c08..53204f21cdad4 100755 --- a/convert.py +++ b/convert.py @@ -1108,7 +1108,7 @@ def load_vocab(path: Path, vocabtype: str | None) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. - if path.is_dir(): + if path.is_dir() and vocabtype != 'hf': vocab_file = "tokenizer.model" if vocabtype == 'bpe': vocab_file = "vocab.json" From f888d2ea13d6f2838b5f7f74118c8fd7de14e0b9 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:40:14 +0000 Subject: [PATCH 03/30] Update convert.py --- convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert.py b/convert.py index 53204f21cdad4..34f380cd70eee 100755 --- a/convert.py +++ b/convert.py @@ -27,6 +27,7 @@ import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore[import] +from transformers import AutoTokenizer import os if 'NO_LOCAL_GGUF' not in os.environ: From ea9f35f082c3ff377055af462a4f5e94866240e7 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:42:03 +0000 Subject: [PATCH 04/30] add bytes_to_unicode function --- convert.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/convert.py b/convert.py index 34f380cd70eee..98ea6d9c8d2c7 100755 --- a/convert.py +++ b/convert.py @@ -301,6 +301,27 @@ def load(model_plus: ModelPlus) -> Params: # # vocab # +def bytes_to_unicode(): + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + return dict(zip(bs, (chr(n) for n in cs))) + class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: From c7b636e9507a9ae7dd81242004ab1c9497ea942a Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:43:22 +0000 Subject: [PATCH 05/30] change add_meta_vocab fucntion --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 98ea6d9c8d2c7..e84db460f9c20 100755 --- a/convert.py +++ b/convert.py @@ -913,12 +913,12 @@ def add_meta_vocab(self, vocab: Vocab) -> None: scores.append(score) toktypes.append(toktype) - if isinstance(vocab, SentencePieceVocab): + if isinstance(vocab, SentencePieceVocab) or isinstance(vocab, HFVocab): self.gguf.add_tokenizer_model("llama") elif isinstance(vocab, BpeVocab): self.gguf.add_tokenizer_model("gpt2") else: - raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab') + raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab or HFVocab') self.gguf.add_token_list(tokens) self.gguf.add_token_scores(scores) self.gguf.add_token_types(toktypes) From 6ec856b3efab650c74e659db7393efee7c184c5e Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:44:07 +0000 Subject: [PATCH 06/30] remove debug code --- convert.py | 1 - 1 file changed, 1 deletion(-) diff --git a/convert.py b/convert.py index e84db460f9c20..4868b66f9fe88 100755 --- a/convert.py +++ b/convert.py @@ -465,7 +465,6 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} byte_encoder = bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} - print(len(byte_decoder), byte_decoder) for i in range(tokenizer.vocab_size): text = reverse_vocab[i].encode("utf-8") From 1f16e5f234d9466d5443e7cc507247e300de16c0 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 09:46:48 +0000 Subject: [PATCH 07/30] remove byte_encoder --- convert.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/convert.py b/convert.py index 4868b66f9fe88..955b6546fc496 100755 --- a/convert.py +++ b/convert.py @@ -301,28 +301,6 @@ def load(model_plus: ModelPlus) -> Params: # # vocab # -def bytes_to_unicode(): - # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. 
- """ - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8+n) - n += 1 - return dict(zip(bs, (chr(n) for n in cs))) - - class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) @@ -463,8 +441,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - byte_encoder = bytes_to_unicode() - byte_decoder = {v: k for k, v in byte_encoder.items()} for i in range(tokenizer.vocab_size): text = reverse_vocab[i].encode("utf-8") From e876aec1893313c3c164f2621c2cd5a6a5d78720 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 10:37:41 +0000 Subject: [PATCH 08/30] Add newline between classes --- convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert.py b/convert.py index 955b6546fc496..4077be7f2b117 100755 --- a/convert.py +++ b/convert.py @@ -414,6 +414,7 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def __repr__(self) -> str: return f"" + class HFVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) From 177845089f03ed5bd528ecd307dee6d4bcbbbdeb Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 15 Oct 2023 11:34:25 +0000 Subject: [PATCH 09/30] Check tokenizer.json when tokenizer.model is not exist. --- convert.py | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/convert.py b/convert.py index 4077be7f2b117..8b3e929235c7e 100755 --- a/convert.py +++ b/convert.py @@ -1102,25 +1102,51 @@ def load_some_model(path: Path) -> ModelPlus: return model_plus +def vocab_check_and_append_path(path: Path, vocab_file: str) -> bool: + path2 = path / vocab_file + # Use `.parent` instead of /.. to handle the symlink case better. + path3 = path.parent / vocab_file + + if path2.exists(): + path = path2 + elif path3.exists(): + path = path3 + else: + path = None + + return path + + + def load_vocab(path: Path, vocabtype: str | None) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. - if path.is_dir() and vocabtype != 'hf': + if path.is_dir(): + find_candidates = [] + vocab_file = "tokenizer.model" - if vocabtype == 'bpe': + if vocabtype == "bpe": vocab_file = "vocab.json" - path2 = path / vocab_file - # Use `.parent` instead of /.. to handle the symlink case better. - path3 = path.parent / vocab_file - if path2.exists(): - path = path2 - elif path3.exists(): - path = path3 + + path_candidate = vocab_check_and_append_path(path, vocab_file) + find_candidates.append(vocab_file) + + if path_candidate is None: + vocab_file = "tokenizer.json" + hf_path = vocab_check_and_append_path(path, vocab_file) + find_candidates.append(vocab_file) + + if vocabtype == "spm" and hf_path is not None: + # A case where there is no tokenizer.model but there is a tokenizer.json and it needs to be loaded into HFVocab. 
+ vocabtype = "hf" + else: + raise FileNotFoundError( + f"Could not find {find_candidates} in {path} or its parent; " + "if it's in another directory, pass the directory as --vocab-dir") else: - raise FileNotFoundError( - f"Could not find {vocab_file} in {path} or its parent; " - "if it's in another directory, pass the directory as --vocab-dir") + path = path_candidate + print(f"Loading vocab file '{path}', type '{vocabtype}'") @@ -1209,7 +1235,7 @@ def main(args_in: list[str] | None = None) -> None: assert args.outfile, "need --outfile if using --vocab-only" # FIXME: Try to respect vocab_dir somehow? vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype in ('bpe', 'hf')) + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab)) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") @@ -1221,7 +1247,7 @@ def main(args_in: list[str] | None = None) -> None: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir, args.vocabtype) # FIXME: Try to respect vocab_dir somehow? - special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype in ('bpe', 'hf')) + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab)) model = model_plus.model model = convert_model_names(model, params) From a5b26b660bb92f8f2f206179b9e98dab814f9836 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 18 Oct 2023 01:01:50 +0000 Subject: [PATCH 10/30] Move transformers dependency to local code --- convert.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 8b3e929235c7e..c742a1a00ec7b 100755 --- a/convert.py +++ b/convert.py @@ -27,7 +27,6 @@ import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore[import] -from transformers import AutoTokenizer import os if 'NO_LOCAL_GGUF' not in os.environ: @@ -417,6 +416,14 @@ def __repr__(self) -> str: class HFVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: + try: + from transformers import AutoTokenizer + except ModuleNotFoundError: + raise ImportError( + "To use HFVocab, please install the `transformers` package. " + "You can install it with `pip install transformers`." 
+ ) + self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) added_tokens: dict[str, int] @@ -438,7 +445,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens - def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} From 5a1f178091bf0d0be985f91d4a7f520ef156a122 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 18 Oct 2023 01:30:38 +0000 Subject: [PATCH 11/30] Add error context with 'raise from' --- convert.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index c742a1a00ec7b..28de7774c417e 100755 --- a/convert.py +++ b/convert.py @@ -418,11 +418,11 @@ class HFVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: try: from transformers import AutoTokenizer - except ModuleNotFoundError: + except ImportError as e: raise ImportError( "To use HFVocab, please install the `transformers` package. " "You can install it with `pip install transformers`." - ) + ) from e self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) @@ -445,6 +445,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} From 89611cb05a6ff6da1ad0e1fbd346f504e9dc999f Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Mon, 23 Oct 2023 04:15:43 +0000 Subject: [PATCH 12/30] Add fast tokenizer option to BpeVocab --- convert.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/convert.py b/convert.py index 28de7774c417e..b31fb4057e311 100755 --- a/convert.py +++ b/convert.py @@ -302,23 +302,35 @@ def load(model_plus: ModelPlus) -> Params: # class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) + fast_tokenizer = fname_tokenizer.name == 'tokenizer.json' + tokenizer_json = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) + + if fast_tokenizer: + self.bpe_tokenizer = tokenizer_json['model']['vocab'] + else: + self.bpe_tokenizer = tokenizer_json + added_tokens: dict[str, int] if fname_added_tokens is not None: - # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) else: - # Fall back to trying to find the added tokens in tokenizer.json - tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' - if not tokenizer_json_file.is_file(): - added_tokens = {} - else: - tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) - added_tokens = dict( - (item['content'], item['id']) - for item in tokenizer_json.get('added_tokens', []) - # Added tokens here can be duplicates of the main vocabulary. 
- if item['content'] not in self.bpe_tokenizer ) + if not fast_tokenizer: + tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' + + if not tokenizer_json_file.is_file(): + added_tokens = {} + else: + tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) + + added_tokens = dict( + (item['content'], item['id']) + for item in tokenizer_json.get('added_tokens', [])) + + added_tokens = dict( + (token_content, token_id) + for token_content, token_id in added_tokens.items() + # Added tokens here can be duplicates of the main vocabulary. + if token_content not in self.bpe_tokenizer) vocab_size: int = len(self.bpe_tokenizer) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) From e71544231c53e679fb7471129b22f783a8a159c1 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 29 Oct 2023 18:29:38 +0000 Subject: [PATCH 13/30] Update convert.py --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 6a162c4d66379..77a04062d67f8 100755 --- a/convert.py +++ b/convert.py @@ -1159,7 +1159,7 @@ def load_vocab(path: Path, vocabtype: str | None) -> Vocab: hf_path = vocab_check_and_append_path(path, vocab_file) find_candidates.append(vocab_file) - if vocabtype == "spm" and hf_path is not None: + if hf_path is not None: # A case where there is no tokenizer.model but there is a tokenizer.json and it needs to be loaded into HFVocab. vocabtype = "hf" else: From d54764d0b1d2f4a1e3fe6736783afff1bd81a20d Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Mon, 30 Oct 2023 11:54:59 +0000 Subject: [PATCH 14/30] Add VocabLoader and remove *Vocab class --- convert.py | 246 +++++++++++------------------------------------------ 1 file changed, 50 insertions(+), 196 deletions(-) diff --git a/convert.py b/convert.py index 77a04062d67f8..a61853afe894b 100755 --- a/convert.py +++ b/convert.py @@ -297,155 +297,24 @@ def load(model_plus: ModelPlus) -> Params: return params -# -# vocab -# -class BpeVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - fast_tokenizer = fname_tokenizer.name == 'tokenizer.json' - tokenizer_json = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) - - if fast_tokenizer: - self.bpe_tokenizer = tokenizer_json['model']['vocab'] - else: - self.bpe_tokenizer = tokenizer_json - - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - if not fast_tokenizer: - tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' - - if not tokenizer_json_file.is_file(): - added_tokens = {} - else: - tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) - - added_tokens = dict( - (item['content'], item['id']) - for item in tokenizer_json.get('added_tokens', [])) - - added_tokens = dict( - (token_content, token_id) - for token_content, token_id in added_tokens.items() - # Added tokens here can be duplicates of the main vocabulary. 
- if token_content not in self.bpe_tokenizer) - - vocab_size: int = len(self.bpe_tokenizer) - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - expected_end_id = vocab_size + len(actual_ids) - 1 - raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") - - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.bpe_tokenizer - from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} - - for i, _ in enumerate(tokenizer): - yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.CONTROL - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.bpe_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - added_tokens = {} - - vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - - new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} - expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) - - if expected_new_ids != actual_new_ids: - raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") - - # Token pieces that were added to the base vocabulary. - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - - def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.sentencepiece_tokenizer - for i in range(tokenizer.vocab_size()): - piece = tokenizer.id_to_piece(i) - text: bytes = piece.encode("utf-8") - score: float = tokenizer.get_score(i) - - toktype = gguf.TokenType.NORMAL - if tokenizer.is_unknown(i): - toktype = gguf.TokenType.UNKNOWN - if tokenizer.is_control(i): - toktype = gguf.TokenType.CONTROL - - # NOTE: I think added_tokens are user defined. 
- # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto - # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED - - if tokenizer.is_unused(i): - toktype = gguf.TokenType.UNUSED - if tokenizer.is_byte(i): - toktype = gguf.TokenType.BYTE - - yield text, score, toktype - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED - - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - yield from self.sentencepiece_tokens() - yield from self.added_tokens() - - def __repr__(self) -> str: - return f"" - - -class HFVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: +class VocabLoader: + def __init__(self, fname_tokenizer: Path) -> None: try: from transformers import AutoTokenizer except ImportError as e: raise ImportError( - "To use HFVocab, please install the `transformers` package. " + "To use VocabLoader, please install the `transformers` package. " "You can install it with `pip install transformers`." ) from e self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) + vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - added_tokens = {} + added_tokens = { + token: tid + for token, tid in self.tokenizer.get_added_vocab().items() + if token not in vocab_set + } vocab_size: int = self.tokenizer.vocab_size @@ -459,7 +328,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No self.vocab_size_base: int = vocab_size self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer @@ -478,10 +346,34 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: yield from self.hf_tokens() yield from self.added_tokens() + def get_vocab_type(self) -> str: + path_candidates = [] + vocab_file = "tokenizer.model" + path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + if path_candidate is not None: + return "llama" + + path_candidates.append(path_candidate) + vocab_file = "vocab.json" + path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + if path_candidate is not None: + return "gpt2" + + path_candidates.append(path_candidate) + vocab_file = "tokenizer.json" + path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + if path_candidate: + return "llama" + + path_candidates.append(path_candidate) + raise FileNotFoundError( + f"Could not find {find_candidates} in {path} or its parent; " + "if it's in another directory, pass the directory as --vocab-dir") + def __repr__(self) -> str: - return f"" + return f"" -Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab | HFVocab' +Vocab: TypeAlias = 'VocabLoader' # # data loading @@ -854,17 +746,14 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: - assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) if params.n_vocab == vocab.vocab_size_base: print("Ignoring added_tokens.json 
since model matches vocab size without it.") vocab.added_tokens_list = [] vocab.vocab_size = vocab.vocab_size_base return msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" - if vocab.fname_added_tokens is not None: - msg += f" combined with {vocab.fname_added_tokens}" msg += f" has {vocab.vocab_size})." - if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None: + if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20: msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." raise Exception(msg) @@ -911,12 +800,9 @@ def add_meta_vocab(self, vocab: Vocab) -> None: scores.append(score) toktypes.append(toktype) - if isinstance(vocab, SentencePieceVocab) or isinstance(vocab, HFVocab): - self.gguf.add_tokenizer_model("llama") - elif isinstance(vocab, BpeVocab): - self.gguf.add_tokenizer_model("gpt2") - else: - raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab or HFVocab') + vocab_type = vocab.get_vocab_type() + self.gguf.add_tokenizer_model(vocab_type) + self.gguf.add_token_list(tokens) self.gguf.add_token_scores(scores) self.gguf.add_token_types(toktypes) @@ -1137,50 +1023,16 @@ def vocab_check_and_append_path(path: Path, vocab_file: str) -> bool: path = None return path - - -def load_vocab(path: Path, vocabtype: str | None) -> Vocab: + +def load_vocab(path: Path) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. - if path.is_dir(): - find_candidates = [] - - vocab_file = "tokenizer.model" - if vocabtype == "bpe": - vocab_file = "vocab.json" - - path_candidate = vocab_check_and_append_path(path, vocab_file) - find_candidates.append(vocab_file) - - if path_candidate is None: - vocab_file = "tokenizer.json" - hf_path = vocab_check_and_append_path(path, vocab_file) - find_candidates.append(vocab_file) - - if hf_path is not None: - # A case where there is no tokenizer.model but there is a tokenizer.json and it needs to be loaded into HFVocab. 
- vocabtype = "hf" - else: - raise FileNotFoundError( - f"Could not find {find_candidates} in {path} or its parent; " - "if it's in another directory, pass the directory as --vocab-dir") - else: - path = path_candidate + print(f"Loading vocab file '{path}'") - print(f"Loading vocab file '{path}', type '{vocabtype}'") - - added_tokens_path = path.parent / "added_tokens.json" - if vocabtype == "bpe": - return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None) - elif vocabtype == "spm": - return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) - elif vocabtype == "hf": - return HFVocab(path, added_tokens_path if added_tokens_path.exists() else None) - else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") + return VocabLoader(path) def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: @@ -1215,7 +1067,6 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--vocabtype", choices=["spm", "bpe", "hf"], help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") @@ -1261,9 +1112,9 @@ def main(args_in: list[str] | None = None) -> None: if not args.outfile: raise ValueError("need --outfile if using --vocab-only") # FIXME: Try to respect vocab_dir somehow? - vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) + vocab = load_vocab(args.vocab_dir or args.model) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab), + load_merges = True, n_vocab = vocab.vocab_size) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) @@ -1274,12 +1125,15 @@ def main(args_in: list[str] | None = None) -> None: vocab = model_plus.vocab else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(vocab_dir, args.vocabtype) + vocab = load_vocab(vocab_dir) + # FIXME: Try to respect vocab_dir somehow? 
+ print(f"Vocab info: {vocab}") special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = isinstance(vocab, BpeVocab) or isinstance(vocab, HFVocab), + load_merges = True, n_vocab = vocab.vocab_size) + print(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params) ftype = pick_output_type(model, args.outtype) From e19b78038a8fdef128006433118ce374f2e97b43 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Mon, 30 Oct 2023 11:56:49 +0000 Subject: [PATCH 15/30] Add transformers dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 81c909d0ba7fe..badfec3be804c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy==1.24.4 sentencepiece==0.1.98 +transformers>=4.34.0 gguf>=0.1.0 From 28f09beb60e56c5e1eea68223cd809da57d3c02d Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 5 Nov 2023 20:22:25 +0900 Subject: [PATCH 16/30] remove added tokens and check newline token to decide spm or bpe --- convert.py | 46 +++++++++++++++------------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/convert.py b/convert.py index a61853afe894b..d4d2a5a9b74c9 100755 --- a/convert.py +++ b/convert.py @@ -309,42 +309,23 @@ def __init__(self, fname_tokenizer: Path) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} - - added_tokens = { - token: tid - for token, tid in self.tokenizer.get_added_vocab().items() - if token not in vocab_set - } - vocab_size: int = self.tokenizer.vocab_size - - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) - if expected_ids != actual_ids: - raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") - - items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size: int = len(self.tokenizer.vocab) self.fname_tokenizer = fname_tokenizer def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - for i in range(tokenizer.vocab_size): + for i in range(self.vocab_size): text = reverse_vocab[i].encode("utf-8") yield text, 0.0, gguf.TokenType.NORMAL - - def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + + def has_newline_token(self): + return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: yield from self.hf_tokens() - yield from self.added_tokens() def get_vocab_type(self) -> str: path_candidates = [] @@ -352,26 +333,29 @@ def get_vocab_type(self) -> str: path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "llama" - + path_candidates.append(path_candidate) vocab_file = "vocab.json" path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "gpt2" - + path_candidates.append(path_candidate) vocab_file = "tokenizer.json" 
path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate: - return "llama" - + if not self.tokenizer.can_save_slow_tokenizer(): + return "gpt2" + else: + return "llama" + path_candidates.append(path_candidate) raise FileNotFoundError( f"Could not find {find_candidates} in {path} or its parent; " "if it's in another directory, pass the directory as --vocab-dir") def __repr__(self) -> str: - return f"" + return f"" Vocab: TypeAlias = 'VocabLoader' @@ -746,10 +730,10 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: - if params.n_vocab == vocab.vocab_size_base: + if params.n_vocab == vocab.vocab_size: print("Ignoring added_tokens.json since model matches vocab size without it.") vocab.added_tokens_list = [] - vocab.vocab_size = vocab.vocab_size_base + vocab.vocab_size = vocab.vocab_size return msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" msg += f" has {vocab.vocab_size})." From 4adb8b986217b789423a1c11b2d4ce3cb131fd06 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 5 Nov 2023 22:41:40 +0900 Subject: [PATCH 17/30] Update convert.py --- convert.py | 53 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/convert.py b/convert.py index d4d2a5a9b74c9..7e43bfb01a779 100755 --- a/convert.py +++ b/convert.py @@ -310,6 +310,8 @@ def __init__(self, fname_tokenizer: Path) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} + self.added_tokens_list = [] + self.vocab_size_base: int = len(self.tokenizer.vocab) self.vocab_size: int = len(self.tokenizer.vocab) self.fname_tokenizer = fname_tokenizer @@ -317,15 +319,21 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - for i in range(self.vocab_size): + for i in range(self.vocab_size_base): text = reverse_vocab[i].encode("utf-8") yield text, 0.0, gguf.TokenType.NORMAL - + + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + for text in self.added_tokens_list: + score = -1000.0 + yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + def has_newline_token(self): return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: yield from self.hf_tokens() + yield from self.added_tokens() def get_vocab_type(self) -> str: path_candidates = [] @@ -344,7 +352,7 @@ def get_vocab_type(self) -> str: vocab_file = "tokenizer.json" path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate: - if not self.tokenizer.can_save_slow_tokenizer(): + if not self.has_newline_token(): return "gpt2" else: return "llama" @@ -355,7 +363,7 @@ def get_vocab_type(self) -> str: "if it's in another directory, pass the directory as --vocab-dir") def __repr__(self) -> str: - return f"" + return f"" Vocab: TypeAlias = 'VocabLoader' @@ -728,17 +736,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc break yield result -def check_vocab_size(params: Params, vocab: Vocab) -> None: +def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: if params.n_vocab != vocab.vocab_size: if 
params.n_vocab == vocab.vocab_size: print("Ignoring added_tokens.json since model matches vocab size without it.") vocab.added_tokens_list = [] vocab.vocab_size = vocab.vocab_size return + + if pad_vocab and params.n_vocab > vocab.vocab_size: + pad_count = params.n_vocab - vocab.vocab_size + print(f'Padding vocab with {pad_count} token(s) - through ') + for i in range(1, (params.n_vocab - vocab.vocab_size) + 1): + vocab.added_tokens_list.append(f'') + vocab.vocab_size = params.n_vocab + return msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" msg += f" has {vocab.vocab_size})." if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20: msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." + if vocab.vocab_size < params.n_vocab: + msg += " Possibly try using the --padvocab option." raise Exception(msg) @@ -812,8 +830,12 @@ def close(self) -> None: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: - check_vocab_size(params, vocab) + def write_vocab_only( + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + ) -> None: + check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -840,8 +862,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: return dt.quantize(arr) @staticmethod - def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None: - check_vocab_size(params, vocab) + def write_all( + fname_out : Path, ftype: GGMLFileType, params: Params, + model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, + endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab : bool = False, + ) -> None: + check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1054,6 +1082,7 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") args = parser.parse_args(args_in) if args.dump_single: @@ -1101,7 +1130,8 @@ def main(args_in: list[str] | None = None) -> None: load_merges = True, n_vocab = vocab.vocab_size) outfile = args.outfile - OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) + OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, + endianess = endianess, pad_vocab = args.padvocab) print(f"Wrote {outfile}") return @@ -1127,7 +1157,8 @@ def main(args_in: list[str] | None = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess) + OutputFile.write_all(outfile, ftype, params, model, vocab, 
special_vocab, + concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab) print(f"Wrote {outfile}") From 13f07013ee71cf533b712fbc0fd167428d2a4e8e Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 5 Nov 2023 23:18:26 +0900 Subject: [PATCH 18/30] Add special token type --- convert.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/convert.py b/convert.py index 7e43bfb01a779..c4eb28391bcdc 100755 --- a/convert.py +++ b/convert.py @@ -311,17 +311,18 @@ def __init__(self, fname_tokenizer: Path) -> None: vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} self.added_tokens_list = [] - self.vocab_size_base: int = len(self.tokenizer.vocab) - self.vocab_size: int = len(self.tokenizer.vocab) + self.vocab_size_base: int = len(vocab_set) + self.vocab_size: int = len(vocab_set) self.fname_tokenizer = fname_tokenizer def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + special_ids = set(tokenizer.all_special_ids) for i in range(self.vocab_size_base): text = reverse_vocab[i].encode("utf-8") - yield text, 0.0, gguf.TokenType.NORMAL + yield text, 0.0, gguf.TokenType.NORMAL if i not in special_ids else gguf.TokenType.CONTROL def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: From f37a7d7028d0df1f5634984c95cd634402326244 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 12 Nov 2023 02:22:37 +0900 Subject: [PATCH 19/30] Update convert.py --- convert.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/convert.py b/convert.py index c4eb28391bcdc..f72ed9ab5a0c9 100755 --- a/convert.py +++ b/convert.py @@ -311,9 +311,18 @@ def __init__(self, fname_tokenizer: Path) -> None: vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} self.added_tokens_list = [] + self.unk_token_id = self.tokenizer.unk_token_id + self.special_ids = set(self.tokenizer.all_special_ids) self.vocab_size_base: int = len(vocab_set) self.vocab_size: int = len(vocab_set) self.fname_tokenizer = fname_tokenizer + + vocab_file = "tokenizer.model" + path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + if path_candidate is not None: + self.spm = SentencePieceProcessor(str(path_candidate)) + else: + self.spm def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer @@ -322,7 +331,32 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for i in range(self.vocab_size_base): text = reverse_vocab[i].encode("utf-8") - yield text, 0.0, gguf.TokenType.NORMAL if i not in special_ids else gguf.TokenType.CONTROL + yield text, self.get_token_score(i), self.get_token_type(i) + + def get_token_type(self, token_id): + toktype = gguf.TokenType.NORMAL + + if self.spm is None: + if i == self.unk_token_id: + toktype = gguf.TokenType.UNKNOWN + if i in self.special_ids: + toktype = gguf.TokenType.CONTROL + else: + if self.spm.is_unknown(token_id): + toktype = gguf.TokenType.UNKNOWN + if self.spm.is_control(token_id): + toktype = gguf.TokenType.CONTROL + if self.spm.is_unused(token_id): + toktype = gguf.TokenType.UNUSED + if self.spm.is_byte(token_id): + toktype = gguf.TokenType.BYTE + return toktype + + def get_token_score(self, token_id): + if self.spm is not None: + return self.spm.get_score(token_id) + else: + return 0.0 def added_tokens(self) 
-> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: From 9f4dc236a90f8af05daa20c62889805a9939d311 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 12 Nov 2023 03:23:41 +0900 Subject: [PATCH 20/30] Update convert.py --- convert.py | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/convert.py b/convert.py index f72ed9ab5a0c9..c79318887a86e 100755 --- a/convert.py +++ b/convert.py @@ -310,8 +310,16 @@ def __init__(self, fname_tokenizer: Path) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} - self.added_tokens_list = [] + self.added_tokens_list = [tok for tok in self.tokenizer.get_added_vocab()] + self.added_tokens_dict = dict(self.tokenizer.get_added_vocab()) + self.added_tokens_ids = set(self.tokenizer.get_added_vocab().values()) + self.unk_token_id = self.tokenizer.unk_token_id + self.specials = { + tok: self.tokenizer.vocab[tok] + for tok in self.tokenizer.all_special_tokens + } + print(self.specials) self.special_ids = set(self.tokenizer.all_special_ids) self.vocab_size_base: int = len(vocab_set) self.vocab_size: int = len(vocab_set) @@ -321,6 +329,7 @@ def __init__(self, fname_tokenizer: Path) -> None: path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: self.spm = SentencePieceProcessor(str(path_candidate)) + print(self.spm.vocab_size(), self.vocab_size_base) else: self.spm @@ -330,18 +339,16 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: special_ids = set(tokenizer.all_special_ids) for i in range(self.vocab_size_base): + if i in self.added_tokens_ids: + continue + text = reverse_vocab[i].encode("utf-8") yield text, self.get_token_score(i), self.get_token_type(i) - def get_token_type(self, token_id): - toktype = gguf.TokenType.NORMAL + def get_token_type(self, token_id, default_type=gguf.TokenType.NORMAL): + toktype = default_type - if self.spm is None: - if i == self.unk_token_id: - toktype = gguf.TokenType.UNKNOWN - if i in self.special_ids: - toktype = gguf.TokenType.CONTROL - else: + if self.spm is not None and token_id < self.spm.vocab_size(): if self.spm.is_unknown(token_id): toktype = gguf.TokenType.UNKNOWN if self.spm.is_control(token_id): @@ -350,18 +357,35 @@ def get_token_type(self, token_id): toktype = gguf.TokenType.UNUSED if self.spm.is_byte(token_id): toktype = gguf.TokenType.BYTE + else: + if token_id == self.unk_token_id: + toktype = gguf.TokenType.UNKNOWN + if token_id in self.special_ids: + toktype = gguf.TokenType.CONTROL + return toktype def get_token_score(self, token_id): - if self.spm is not None: + if self.spm is not None and token_id < self.spm.vocab_size(): return self.spm.get_score(token_id) else: return 0.0 def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: + default_toktype = gguf.TokenType.USER_DEFINED + for text in self.added_tokens_list: - score = -1000.0 - yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED + + if text in self.specials: + + toktype = self.get_token_type(self.specials[text], default_toktype) + score = self.get_token_score(self.specials[text]) + + else: + toktype = default_toktype + score = -1000.0 + + yield text.encode("utf-8"), score, toktype def has_newline_token(self): return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab From dcf372e60ecd520f947e80eb95f8873c648b1613 Mon Sep 17 00:00:00 2001 
From: wonjun Jang Date: Sun, 12 Nov 2023 03:26:46 +0900 Subject: [PATCH 21/30] Update convert.py --- convert.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/convert.py b/convert.py index c79318887a86e..45123adb297d7 100755 --- a/convert.py +++ b/convert.py @@ -345,8 +345,8 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: text = reverse_vocab[i].encode("utf-8") yield text, self.get_token_score(i), self.get_token_type(i) - def get_token_type(self, token_id, default_type=gguf.TokenType.NORMAL): - toktype = default_type + def get_token_type(self, token_id): + toktype = gguf.TokenType.NORMAL if self.spm is not None and token_id < self.spm.vocab_size(): if self.spm.is_unknown(token_id): @@ -372,17 +372,16 @@ def get_token_score(self, token_id): return 0.0 def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - default_toktype = gguf.TokenType.USER_DEFINED for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type(self.specials[text], default_toktype) + toktype = self.get_token_type(self.specials[text]) score = self.get_token_score(self.specials[text]) else: - toktype = default_toktype + toktype = gguf.TokenType.USER_DEFINED score = -1000.0 yield text.encode("utf-8"), score, toktype From cc1f3fcfadd5321a2e2e8e4f5cdf0f1ba49d2468 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 15 Nov 2023 17:22:59 +0900 Subject: [PATCH 22/30] Fix typo in convert.py --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 45123adb297d7..ece023dc69a30 100755 --- a/convert.py +++ b/convert.py @@ -331,7 +331,7 @@ def __init__(self, fname_tokenizer: Path) -> None: self.spm = SentencePieceProcessor(str(path_candidate)) print(self.spm.vocab_size(), self.vocab_size_base) else: - self.spm + self.spm = None def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer From 026eb7cd01e6dd478357c60608511b0fd55f5301 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sat, 18 Nov 2023 12:55:14 +0900 Subject: [PATCH 23/30] Fix when params.n_vocab < tokenizer vocab size --- convert.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/convert.py b/convert.py index ece023dc69a30..ae47e198d3b65 100755 --- a/convert.py +++ b/convert.py @@ -298,7 +298,7 @@ def load(model_plus: ModelPlus) -> Params: class VocabLoader: - def __init__(self, fname_tokenizer: Path) -> None: + def __init__(self, params: Params, fname_tokenizer: Path) -> None: try: from transformers import AutoTokenizer except ImportError as e: @@ -309,10 +309,18 @@ def __init__(self, fname_tokenizer: Path) -> None: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} - - self.added_tokens_list = [tok for tok in self.tokenizer.get_added_vocab()] - self.added_tokens_dict = dict(self.tokenizer.get_added_vocab()) - self.added_tokens_ids = set(self.tokenizer.get_added_vocab().values()) + + self.added_tokens_list = [] + self.added_tokens_dict = dict() + self.added_tokens_ids = set() + + for tok, tokidx in self.tokenizer.get_added_vocab().items(): + if tokidx >= params.n_vocab or toksize < self.tokenizer.vocab_size: + continue + + self.added_tokens_list.append(tok) + self.added_tokens_dict[tok] = tokidx + self.added_tokens_ids.add(tokidx) self.unk_token_id = self.tokenizer.unk_token_id self.specials = { @@ -321,8 +329,8 @@ def __init__(self, 
fname_tokenizer: Path) -> None: } print(self.specials) self.special_ids = set(self.tokenizer.all_special_ids) - self.vocab_size_base: int = len(vocab_set) - self.vocab_size: int = len(vocab_set) + self.vocab_size_base: int = self.tokenizer.vocab_size + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer vocab_file = "tokenizer.model" @@ -374,7 +382,6 @@ def get_token_score(self, token_id): def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: - if text in self.specials: toktype = self.get_token_type(self.specials[text]) @@ -1095,14 +1102,14 @@ def vocab_check_and_append_path(path: Path, vocab_file: str) -> bool: return path -def load_vocab(path: Path) -> Vocab: +def load_vocab(params: Params, path: Path) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. print(f"Loading vocab file '{path}'") - return VocabLoader(path) + return VocabLoader(params, path) def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: @@ -1183,7 +1190,7 @@ def main(args_in: list[str] | None = None) -> None: if not args.outfile: raise ValueError("need --outfile if using --vocab-only") # FIXME: Try to respect vocab_dir somehow? - vocab = load_vocab(args.vocab_dir or args.model) + vocab = load_vocab(params, args.vocab_dir or args.model) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = True, n_vocab = vocab.vocab_size) @@ -1197,7 +1204,7 @@ def main(args_in: list[str] | None = None) -> None: vocab = model_plus.vocab else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(vocab_dir) + vocab = load_vocab(params, vocab_dir) # FIXME: Try to respect vocab_dir somehow? print(f"Vocab info: {vocab}") From 2e263ca2003529943c97699bd53c6805f4435736 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Sun, 19 Nov 2023 10:20:06 +0900 Subject: [PATCH 24/30] update vocab class --- convert.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/convert.py b/convert.py index ae47e198d3b65..6174e629bb587 100755 --- a/convert.py +++ b/convert.py @@ -307,15 +307,19 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: "You can install it with `pip install transformers`." 
) from e - self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer)) - vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()} + try: + self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True) + vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} + except: + self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True) + vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} self.added_tokens_list = [] self.added_tokens_dict = dict() self.added_tokens_ids = set() - for tok, tokidx in self.tokenizer.get_added_vocab().items(): - if tokidx >= params.n_vocab or toksize < self.tokenizer.vocab_size: + for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]): + if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size: continue self.added_tokens_list.append(tok) @@ -324,7 +328,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: self.unk_token_id = self.tokenizer.unk_token_id self.specials = { - tok: self.tokenizer.vocab[tok] + tok: self.tokenizer.get_vocab()[tok] for tok in self.tokenizer.all_special_tokens } print(self.specials) @@ -343,7 +347,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()} special_ids = set(tokenizer.all_special_ids) for i in range(self.vocab_size_base): From 5ac1949fff740425d32a91066551208c8fde0d05 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Wed, 22 Nov 2023 19:54:04 +0900 Subject: [PATCH 25/30] change funtion name --- convert.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/convert.py b/convert.py index 6174e629bb587..20e2be5c2d639 100755 --- a/convert.py +++ b/convert.py @@ -338,7 +338,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: self.fname_tokenizer = fname_tokenizer vocab_file = "tokenizer.model" - path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: self.spm = SentencePieceProcessor(str(path_candidate)) print(self.spm.vocab_size(), self.vocab_size_base) @@ -407,19 +407,19 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def get_vocab_type(self) -> str: path_candidates = [] vocab_file = "tokenizer.model" - path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "llama" path_candidates.append(path_candidate) vocab_file = "vocab.json" - path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "gpt2" path_candidates.append(path_candidate) vocab_file = "tokenizer.json" - path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file) + path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate: if not self.has_newline_token(): return "gpt2" @@ -1091,7 +1091,7 @@ def load_some_model(path: Path) -> ModelPlus: return model_plus -def 
vocab_check_and_append_path(path: Path, vocab_file: str) -> bool: +def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]: path2 = path / vocab_file # Use `.parent` instead of /.. to handle the symlink case better. path3 = path.parent / vocab_file From 61edd1bc5999480e71f1e3121ea54e33288ce519 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Tue, 28 Nov 2023 16:23:27 +0900 Subject: [PATCH 26/30] Remove unused variable/functions, add types to class variable and methods, delete blank liens --- convert.py | 80 ++++++++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/convert.py b/convert.py index 36f38472dae8b..65ba4334507d8 100644 --- a/convert.py +++ b/convert.py @@ -18,6 +18,7 @@ import time import zipfile from abc import ABCMeta, abstractmethod +from collections import OrderedDict from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path @@ -313,30 +314,25 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: except: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True) vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} - - self.added_tokens_list = [] - self.added_tokens_dict = dict() - self.added_tokens_ids = set() - + + self.added_tokens_dict: OrderedDict[str, int] = OrderedDict() + for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]): if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size: continue - - self.added_tokens_list.append(tok) + self.added_tokens_dict[tok] = tokidx - self.added_tokens_ids.add(tokidx) - - self.unk_token_id = self.tokenizer.unk_token_id - self.specials = { + + self.unk_token_id: int = self.tokenizer.unk_token_id + self.specials: dict[str, int] = { tok: self.tokenizer.get_vocab()[tok] for tok in self.tokenizer.all_special_tokens } - print(self.specials) - self.special_ids = set(self.tokenizer.all_special_ids) + self.special_ids: set[int] = set(self.tokenizer.all_special_ids) self.vocab_size_base: int = self.tokenizer.vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict) + self.fname_tokenizer: str = fname_tokenizer + vocab_file = "tokenizer.model" path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: @@ -348,18 +344,18 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.tokenizer reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()} - special_ids = set(tokenizer.all_special_ids) - + added_tokens_ids = set(self.added_tokens_dict.values()) + for i in range(self.vocab_size_base): - if i in self.added_tokens_ids: + if i in added_tokens_ids: continue - + text = reverse_vocab[i].encode("utf-8") yield text, self.get_token_score(i), self.get_token_type(i) - def get_token_type(self, token_id): + def get_token_type(self, token_id: int) -> gguf.TokenType: toktype = gguf.TokenType.NORMAL - + if self.spm is not None and token_id < self.spm.vocab_size(): if self.spm.is_unknown(token_id): toktype = gguf.TokenType.UNKNOWN @@ -377,27 +373,27 @@ def get_token_type(self, token_id): return toktype - def get_token_score(self, token_id): + def 
get_token_score(self, token_id: int) -> float: if self.spm is not None and token_id < self.spm.vocab_size(): return self.spm.get_score(token_id) else: return 0.0 def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - - for text in self.added_tokens_list: + + for text in self.added_tokens_dict: if text in self.specials: - + toktype = self.get_token_type(self.specials[text]) score = self.get_token_score(self.specials[text]) - + else: toktype = gguf.TokenType.USER_DEFINED score = -1000.0 yield text.encode("utf-8"), score, toktype - def has_newline_token(self): + def has_newline_token(self) -> bool: return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -421,7 +417,7 @@ def get_vocab_type(self) -> str: vocab_file = "tokenizer.json" path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate: - if not self.has_newline_token(): + if not self.has_newline_token(): return "gpt2" else: return "llama" @@ -432,7 +428,7 @@ def get_vocab_type(self) -> str: "if it's in another directory, pass the directory as --vocab-dir") def __repr__(self) -> str: - return f"" + return f"" Vocab: TypeAlias = 'VocabLoader' @@ -814,15 +810,15 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N if params.n_vocab != vocab.vocab_size: if params.n_vocab == vocab.vocab_size: print("Ignoring added_tokens.json since model matches vocab size without it.") - vocab.added_tokens_list = [] + vocab.added_tokens_dict = OrderedDict() vocab.vocab_size = vocab.vocab_size return - + if pad_vocab and params.n_vocab > vocab.vocab_size: pad_count = params.n_vocab - vocab.vocab_size print(f'Padding vocab with {pad_count} token(s) - through ') for i in range(1, (params.n_vocab - vocab.vocab_size) + 1): - vocab.added_tokens_list.append(f'') + vocab.added_tokens_dict[f''] = -1 vocab.vocab_size = params.n_vocab return msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" @@ -1112,25 +1108,15 @@ def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]: path2 = path / vocab_file # Use `.parent` instead of /.. to handle the symlink case better. path3 = path.parent / vocab_file - + if path2.exists(): path = path2 elif path3.exists(): path = path3 else: path = None - - return path - -def load_vocab(params: Params, path: Path) -> Vocab: - # Be extra-friendly and accept either a file or a directory. Also, if it's - # a directory, it might be the model directory, and tokenizer.model might - # be in the parent of that. - - print(f"Loading vocab file '{path}'") - - return VocabLoader(params, path) + return path def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: @@ -1215,7 +1201,7 @@ def main(args_in: list[str] | None = None) -> None: if not args.outfile: raise ValueError("need --outfile if using --vocab-only") # FIXME: Try to respect vocab_dir somehow? - vocab = load_vocab(params, args.vocab_dir or args.model) + vocab = VocabLoader(params, args.vocab_dir or args.model) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = True, n_vocab = vocab.vocab_size) @@ -1229,7 +1215,7 @@ def main(args_in: list[str] | None = None) -> None: vocab = model_plus.vocab else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(params, vocab_dir) + vocab = VocabLoader(params, vocab_dir) # FIXME: Try to respect vocab_dir somehow? 
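The --padvocab branch in the check_vocab_size() hunk above fills the gap between the tokenizer's vocab and params.n_vocab with placeholder tokens. A minimal sketch of that idea, assuming a `<dummy00001>`-style placeholder text (illustrative, not taken verbatim from the patch):

from __future__ import annotations
from collections import OrderedDict

def pad_vocab_to(added_tokens: OrderedDict[str, int], current_size: int, n_vocab: int) -> int:
    # Grow the vocab with placeholder entries until it matches the model's n_vocab.
    pad_count = n_vocab - current_size
    print(f"Padding vocab with {pad_count} token(s)")
    for i in range(1, pad_count + 1):
        added_tokens[f"<dummy{i:05}>"] = -1  # -1 mirrors the placeholder id used in the hunk above
    return n_vocab  # the caller records this as the new vocab_size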
print(f"Vocab info: {vocab}") From 1f5357cbcf3be28a66883c42f9c3521597c070fa Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Tue, 28 Nov 2023 16:46:54 +0900 Subject: [PATCH 27/30] fix flake8 warnings --- convert.py | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/convert.py b/convert.py index 65ba4334507d8..4f38ece0f4c88 100644 --- a/convert.py +++ b/convert.py @@ -22,7 +22,7 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar +from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar, Optional import numpy as np from sentencepiece import SentencePieceProcessor @@ -310,10 +310,8 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: try: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True) - vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} - except: + except Exception: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True) - vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()} self.added_tokens_dict: OrderedDict[str, int] = OrderedDict() @@ -423,15 +421,15 @@ def get_vocab_type(self) -> str: return "llama" path_candidates.append(path_candidate) - raise FileNotFoundError( - f"Could not find {find_candidates} in {path} or its parent; " - "if it's in another directory, pass the directory as --vocab-dir") + raise FileNotFoundError(f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; if it's in another directory, pass the directory as --vocab-dir") def __repr__(self) -> str: return f"" + Vocab: TypeAlias = 'VocabLoader' + # # data loading # TODO: reuse (probably move to gguf.py?) 
@@ -806,6 +804,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc break yield result + def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: if params.n_vocab != vocab.vocab_size: if params.n_vocab == vocab.vocab_size: @@ -907,11 +906,10 @@ def close(self) -> None: self.gguf.close() @staticmethod - def write_vocab_only( - fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, - ) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, + svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False) -> None: check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -939,13 +937,11 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: return dt.quantize(arr) @staticmethod - def write_all( - fname_out : Path, ftype: GGMLFileType, params: Params, - model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, - endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab : bool = False, - ) -> None: + def write_all(fname_out : Path, ftype: GGMLFileType, params: Params, + model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, + endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab : bool = False) -> None: check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1207,7 +1203,7 @@ def main(args_in: list[str] | None = None) -> None: n_vocab = vocab.vocab_size) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, - endianess = endianess, pad_vocab = args.padvocab) + endianess = endianess, pad_vocab = args.padvocab) print(f"Wrote {outfile}") return @@ -1234,7 +1230,7 @@ def main(args_in: list[str] | None = None) -> None: print(f"Writing {outfile}, format {ftype}") OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, - concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab) + concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab) print(f"Wrote {outfile}") From 8fabb0132cf53fe9c4fe18bbbe3c1aa8d09b793c Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 13:03:24 -0500 Subject: [PATCH 28/30] code style cleanup --- convert.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/convert.py b/convert.py index 4f38ece0f4c88..6eab0ee34602a 100644 --- a/convert.py +++ b/convert.py @@ -10,6 +10,7 @@ import json import math import mmap +import os import pickle import re import signal @@ -22,12 +23,11 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar, Optional +from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast import numpy as np from sentencepiece import SentencePieceProcessor -import os if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -417,11 +417,13 @@ def get_vocab_type(self) -> str: if path_candidate: if not self.has_newline_token(): return "gpt2" - else: - return "llama" + return "llama" 
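Taken out of the diff, the vocab-type detection that get_vocab_type() above settles on reads roughly as below; `detect_vocab_type` is a simplified restatement (not a drop-in replacement), the helper mirrors the find_vocab_file_path() renamed in PATCH 25, and the boolean argument stands in for has_newline_token().

from pathlib import Path
from typing import Optional

def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
    # Look next to the model and one directory up (the symlink case handled above).
    for candidate in (path / vocab_file, path.parent / vocab_file):
        if candidate.exists():
            return candidate
    return None

def detect_vocab_type(path: Path, has_newline_token: bool) -> str:
    if find_vocab_file_path(path, "tokenizer.model"):
        return "llama"  # SentencePiece model present
    if find_vocab_file_path(path, "vocab.json"):
        return "gpt2"   # plain BPE vocab
    if find_vocab_file_path(path, "tokenizer.json"):
        return "llama" if has_newline_token else "gpt2"
    raise FileNotFoundError("no tokenizer files found; pass the directory with --vocab-dir")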
path_candidates.append(path_candidate) - raise FileNotFoundError(f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; if it's in another directory, pass the directory as --vocab-dir") + raise FileNotFoundError( + f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; " + "if it's in another directory, pass the directory as --vocab-dir" + ) def __repr__(self) -> str: return f"" @@ -906,10 +908,11 @@ def close(self) -> None: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, - svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False) -> None: + def write_vocab_only( + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + ) -> None: check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -937,11 +940,12 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: return dt.quantize(arr) @staticmethod - def write_all(fname_out : Path, ftype: GGMLFileType, params: Params, - model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, - endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab : bool = False) -> None: + def write_all( + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + pad_vocab: bool = False, + ) -> None: check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) From c3b1c12fdd454cf9a78eb7cb7dc83508319b56d7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 13:03:57 -0500 Subject: [PATCH 29/30] make mypy happy --- convert.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/convert.py b/convert.py index 6eab0ee34602a..49fa3fc168087 100644 --- a/convert.py +++ b/convert.py @@ -329,7 +329,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: self.special_ids: set[int] = set(self.tokenizer.all_special_ids) self.vocab_size_base: int = self.tokenizer.vocab_size self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict) - self.fname_tokenizer: str = fname_tokenizer + self.fname_tokenizer: Path = fname_tokenizer vocab_file = "tokenizer.model" path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) @@ -373,9 +373,8 @@ def get_token_type(self, token_id: int) -> gguf.TokenType: def get_token_score(self, token_id: int) -> float: if self.spm is not None and token_id < self.spm.vocab_size(): - return self.spm.get_score(token_id) - else: - return 0.0 + return cast(float, self.spm.get_score(token_id)) + return 0.0 def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -1110,13 +1109,11 @@ def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]: path3 = path.parent / vocab_file if path2.exists(): - path = path2 - elif path3.exists(): - path = path3 - else: - path = None + return path2 + if path3.exists(): + return path3 - return path + return None def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: From 35e95b6266a0d0fb4fccfb63db3a8eeb1833d283 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Thu, 14 Dec 2023 08:33:10 +0900 Subject: [PATCH 30/30] change exception --- 
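Looking back at the added-token emission reshaped in PATCH 21 and kept through the later cleanups: added tokens that are registered specials take their type and score from the SentencePiece model when one is loaded, and everything else is written as USER_DEFINED with a sentinel score of -1000.0. A compressed, standalone sketch under those assumptions; it collapses the full UNKNOWN/UNUSED/BYTE handling into a CONTROL/NORMAL distinction and is not the converter's exact logic.

from __future__ import annotations
from typing import Optional
from sentencepiece import SentencePieceProcessor  # type: ignore[import]
import gguf  # assumes the gguf-py package is importable, as convert.py arranges

def classify_added_token(text: str, specials: dict[str, int],
                         spm: Optional[SentencePieceProcessor]) -> tuple[float, gguf.TokenType]:
    token_id = specials.get(text)
    if token_id is not None and spm is not None and token_id < spm.vocab_size():
        # Special tokens backed by the SentencePiece model reuse its score and type.
        toktype = gguf.TokenType.CONTROL if spm.is_control(token_id) else gguf.TokenType.NORMAL
        return spm.get_score(token_id), toktype
    # Everything else is user-defined with a sentinel score.
    return -1000.0, gguf.TokenType.USER_DEFINED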
convert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index 49fa3fc168087..9f7ab060a919d 100644 --- a/convert.py +++ b/convert.py @@ -310,7 +310,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None: try: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True) - except Exception: + except ValueError: self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True) self.added_tokens_dict: OrderedDict[str, int] = OrderedDict() @@ -400,25 +400,25 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def get_vocab_type(self) -> str: path_candidates = [] vocab_file = "tokenizer.model" + path_candidates.append(vocab_file) path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "llama" - path_candidates.append(path_candidate) vocab_file = "vocab.json" + path_candidates.append(vocab_file) path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate is not None: return "gpt2" - path_candidates.append(path_candidate) vocab_file = "tokenizer.json" + path_candidates.append(vocab_file) path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file) if path_candidate: if not self.has_newline_token(): return "gpt2" return "llama" - path_candidates.append(path_candidate) raise FileNotFoundError( f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; " "if it's in another directory, pass the directory as --vocab-dir"
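PATCH 30 above stops appending the (possibly None) path_candidate and records the searched filenames instead, so the FileNotFoundError can list what was actually looked for. A self-contained sketch of that pattern; `locate_vocab_file` is an illustrative name, not a convert.py function.

from __future__ import annotations
from pathlib import Path

def locate_vocab_file(path: Path, names: list[str]) -> Path:
    # Record every filename we try so the error message can list them all.
    searched: list[str] = []
    for name in names:
        searched.append(name)
        for candidate in (path / name, path.parent / name):
            if candidate.exists():
                return candidate
    raise FileNotFoundError(
        f"Could not find {searched} in {path} or its parent; "
        "if it's in another directory, pass the directory as --vocab-dir")

Called with ["tokenizer.model", "vocab.json", "tokenizer.json"], this reproduces the search order used by get_vocab_type().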