Fixed #18

noamgat · Nov 6, 2023 · a0ea20e · a0ea20e
1 parent adea08a
commit a0ea20e
Showing 1 changed file with 5 additions and 2 deletions.
diff --git a/lmformatenforcer/tokenizerprefixtree.py b/lmformatenforcer/tokenizerprefixtree.py
@@ -13,9 +13,12 @@ def __init__(self, regular_tokens: List[Tuple[int, str]]):
         self.json_freetext_tokens: List[int] = []
         for token_idx, decoded in regular_tokens:
             self._add_token_to_tree(decoded, token_idx, self.root)
-            # Performance optimization - cache the tokens of all the strings that don't contain a quote in the middle.
+            # Performance optimization - cache the tokens of all the strings that contain neither a line break nor a quote before their last character.
             # When we are in a JSON freetext string field, they will all be permitted and this will save a lot of tree iterations.
-            if '"' not in decoded or decoded.index('"') == len(decoded) - 1:
+            has_quote_before_end = '"' in decoded[0:-1]
+            has_newline = "\n" in decoded or "\r" in decoded
+
+            if not (has_quote_before_end or has_newline):
                 self.json_freetext_tokens.append(token_idx)
 
     def _add_token_to_tree(self, token_str: str, token_idx: int, node: TokenizerPrefixTreeNode):