Skip to content

Commit

Permalink
Update bruteforce test: fix pyright complaints
Browse files Browse the repository at this point in the history
  • Loading branch information
jaime-m-p committed Aug 5, 2024
1 parent 735105e commit fd6d9b9
Showing 1 changed file with 3 additions and 11 deletions.
14 changes: 3 additions & 11 deletions tests/test-tokenizer-random.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,7 @@ def get_vocab(self, detokenize=False) -> list[str]:
text = self.detokenize([id], remove_special=False, unparse_special=True)
else:
text = self.lib.llama_token_get_text(self.model, id)
text = self.ffi.string(text)
text = str(text, encoding="utf-8", errors="replace") # replace errors with '\uFFFD'
text = str(cast(bytes, self.ffi.string(text)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD'
vocab.append(text)
return vocab

Expand Down Expand Up @@ -162,12 +161,13 @@ def __init__(self, dir_tokenizer: str):
self.eos_token = self.model.eos_token

def get_vocab(self, detokenize=False) -> list[str]:
    """Return the tokenizer vocabulary as a list of strings indexed by token id.

    Args:
        detokenize: when True, build each entry by batch-decoding every
            token id from 0..max_token_id; otherwise read the raw text
            from the model's vocab mapping.

    Returns:
        A list of length max_token_id + 1. In the non-detokenize path,
        ids absent from the vocab mapping are left as "".
    """
    vocab: list[str] = []
    max_token_id = max(self.model.get_vocab().values())
    if detokenize:
        ids = list(range(max_token_id + 1))
        vocab = self.model.batch_decode(ids, skip_special_tokens=False)
    else:
        # Pre-fill with "" (not None) so the declared list[str] element
        # type holds and gaps in the id space stay as empty strings.
        vocab = [""] * (max_token_id + 1)
        for text, id in self.model.get_vocab().items():
            vocab[id] = text
    return vocab
Expand Down Expand Up @@ -455,14 +455,6 @@ def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100

def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):

def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
for i, (a, b) in enumerate(zip(ids1, ids2)):
if a != b:
return i
if len(ids1) == len(ids2):
return -1
return min(len(ids1), len(ids2))

def check_detokenizer(text: str, text1: str, text2: str) -> bool:
if text1 == text2: # equal to TokenizerGroundtruth?
return True
Expand Down

0 comments on commit fd6d9b9

Please sign in to comment.