Skip to content

Commit

Permalink
Update bruteforce test: fix pyright complaints
Browse files Browse the repository at this point in the history
  • Loading branch information
jaime-m-p committed Aug 5, 2024
1 parent 735105e commit fd6d9b9
Showing 1 changed file with 3 additions and 11 deletions.
14 changes: 3 additions & 11 deletions tests/test-tokenizer-random.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,7 @@ def get_vocab(self, detokenize=False) -> list[str]:
text = self.detokenize([id], remove_special=False, unparse_special=True)
else:
text = self.lib.llama_token_get_text(self.model, id)
text = self.ffi.string(text)
text = str(text, encoding="utf-8", errors="replace") # replace errors with '\uFFFD'
text = str(cast(bytes, self.ffi.string(text)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD'
vocab.append(text)
return vocab

Expand Down Expand Up @@ -162,12 +161,13 @@ def __init__(self, dir_tokenizer: str):
self.eos_token = self.model.eos_token

def get_vocab(self, detokenize=False) -> list[str]:
    """Return the tokenizer vocabulary as a list of strings indexed by token id.

    Args:
        detokenize: when True, build each entry by batch-decoding every
            token id from 0..max_token_id; otherwise read the raw text
            from the model's vocab mapping.

    Returns:
        A list of length max_token_id + 1. In the non-detokenize path,
        ids absent from the vocab mapping are left as "".
    """
    vocab: list[str] = []
    max_token_id = max(self.model.get_vocab().values())
    if detokenize:
        ids = list(range(max_token_id + 1))
        vocab = self.model.batch_decode(ids, skip_special_tokens=False)
    else:
        # Pre-fill with "" (not None) so the declared list[str] element
        # type holds and gaps in the id space stay as empty strings.
        vocab = [""] * (max_token_id + 1)
        for text, id in self.model.get_vocab().items():
            vocab[id] = text
    return vocab
Expand Down Expand Up @@ -455,14 +455,6 @@ def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100

def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):

def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
for i, (a, b) in enumerate(zip(ids1, ids2)):
if a != b:
return i
if len(ids1) == len(ids2):
return -1
return min(len(ids1), len(ids2))

def check_detokenizer(text: str, text1: str, text2: str) -> bool:
if text1 == text2: # equal to TokenizerGroundtruth?
return True
Expand Down

0 comments on commit fd6d9b9

Please sign in to comment.