From c930f593f5fc592e5b4aae3b9caec063682d066f Mon Sep 17 00:00:00 2001
From: Qubitium <417764+Qubitium@users.noreply.github.com>
Date: Thu, 7 Mar 2024 02:24:21 +0000
Subject: [PATCH] Fix handling of added/expanded model tokens

In transformers, tokenizer.vocab_size excludes all tokens added via
token expansion. The correct usage here is len(tokenizer), which
counts the full vocabulary including added tokens.
---
 lmformatenforcer/integrations/transformers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmformatenforcer/integrations/transformers.py b/lmformatenforcer/integrations/transformers.py
index 839270e..7bcbc97 100644
--- a/lmformatenforcer/integrations/transformers.py
+++ b/lmformatenforcer/integrations/transformers.py
@@ -55,7 +55,7 @@ def unreplace_logits_warper(self):
 def _build_regular_tokens_list(tokenizer: PreTrainedTokenizerBase) -> List[Tuple[int, str, bool]]:
     token_0 = tokenizer.encode("0")[-1]
     regular_tokens = []
-    for token_idx in range(tokenizer.vocab_size):
+    for token_idx in range(len(tokenizer)):
         if token_idx in tokenizer.all_special_ids:
             continue
         # We prepend token 0 and skip the first letter of the result to get a space if the token is a start word.
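
Note for reviewers: the sketch below illustrates the vocab_size vs.
len(tokenizer) discrepancy that motivates this change. It is not part
of the patch; the model name and added token string are placeholders.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    print(tokenizer.vocab_size)   # 50257: base vocabulary only

    # Expand the vocabulary; vocab_size does not reflect the change.
    tokenizer.add_tokens(["<my_new_token>"])
    print(tokenizer.vocab_size)   # still 50257: added tokens excluded
    print(len(tokenizer))         # 50258: includes the added token

    # A loop bounded by tokenizer.vocab_size never visits the new
    # token's id, so the enforcer's regular-token list would silently
    # omit any tokens added after the base vocabulary was built.

With len(tokenizer) as the bound, _build_regular_tokens_list covers
every token id the model can actually emit, including ids introduced
via tokenizer.add_tokens.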