Skip to content

Commit

Permalink
Fixed #18
Browse files (browse the repository at this point in the history)
  • Loading branch information
noamgat committed Nov 6, 2023
1 parent adea08a commit a0ea20e
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions lmformatenforcer/tokenizerprefixtree.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,9 +13,12 @@ def __init__(self, regular_tokens: List[Tuple[int, str]]):
self.json_freetext_tokens: List[int] = []
for token_idx, decoded in regular_tokens:
self._add_token_to_tree(decoded, token_idx, self.root)
# Performance optimization - cache the tokens of all the strings that don't contain a quote in the middle.
# Performance optimization - cache the tokens of all the strings that contain no line break and no quote anywhere except as their last character.
# When we are in a JSON freetext string field, they will all be permitted and this will save a lot of tree iterations.
if '"' not in decoded or decoded.index('"') == len(decoded) - 1:
has_quote_before_end = '"' in decoded[0:-1]
has_newline = "\n" in decoded or "\r" in decoded

if not (has_quote_before_end or has_newline):
self.json_freetext_tokens.append(token_idx)

def _add_token_to_tree(self, token_str: str, token_idx: int, node: TokenizerPrefixTreeNode):
Expand Down

0 comments on commit a0ea20e

Please sign in to comment.