When advancing the parsing state, do not check whether the new character is in get_allowed_characters(); this lets out-of-vocabulary characters be handled without stopping generation.
noamgat · Mar 13, 2024 · 142a5a6 · 142a5a6
1 parent fbcf5af
commit 142a5a6
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/lmformatenforcer/tokenenforcer.py b/lmformatenforcer/tokenenforcer.py
@@ -155,11 +155,11 @@ def _apply_new_characters(self, state: 'TokenEnforcer.OutputTensorState', token_
             new_decoded = self.decoder(new_state.current_word_tokens)
             new_characters = new_decoded[len(prev_decoded):]
         for character in new_characters:
-            if character in new_state.parser.get_allowed_characters():
+            try:
                 new_state.parser = new_state.parser.add_character(character)
-            else:
+            except Exception as e:
                 # This can happen in beam / batch scenarios, when some of the batches finished but others are continuing.
-                logging.debug(f"Received an invalid character '{character}', switching to ForceStopParser")
+                logging.debug(f"Received an invalid character '{character}', switching to ForceStopParser (Exception:{e})")
                 new_state.parser = ForceStopParser()
         return new_state