Skip to content

Commit

Permalink
Fixes illegal escape sequences sometimes appearing in json strings. F…
Browse files Browse the repository at this point in the history
…ixes #41
  • Loading branch information
noamgat committed Dec 16, 2023
1 parent 5afc3ce commit 7ed78bb
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 2 deletions.
9 changes: 8 additions & 1 deletion lmformatenforcer/tokenizerprefixtree.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, List, Tuple

import json

class TokenizerPrefixTreeNode:
def __init__(self):
Expand All @@ -19,6 +19,13 @@ def __init__(self, regular_tokens: List[Tuple[int, str]]):
has_newline = "\n" in decoded or "\r" in decoded

if not (has_quote_before_end or has_newline):
if '\\' in decoded[:-1]:
# If there is a backslash that is not trailing, we might be in illegal JSON territory. Need to verify
# that it is a legal JSON character sequence
try:
json.loads(f'"{decoded}"')
except json.decoder.JSONDecodeError:
continue
self.json_freetext_tokens.append(token_idx)

def _add_token_to_tree(self, token_str: str, token_idx: int, node: TokenizerPrefixTreeNode):
Expand Down
11 changes: 10 additions & 1 deletion tests/test_jsonschemaparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,16 @@ class SomeSchema(BaseModel):
test_string = '{"key": "val",'
with pytest.raises(CharacterNotAllowedException):
_test_json_schema_parsing_with_string(test_string, SomeSchema.schema(), True)



def test_single_quote_must_not_be_escaped():
    """A backslash followed by a single quote inside a JSON string must be rejected.

    The payload places a literal backslash directly before an apostrophe in
    the value of ``key``; parsing it against the schema is expected to raise
    ``CharacterNotAllowedException``.
    """
    class SomeSchema(BaseModel):
        key: str

    # Python literal below yields the characters: {"key": "I\'m a string"}
    escaped_apostrophe_payload = '{"key": "I\\\'m a string"}'
    with pytest.raises(CharacterNotAllowedException):
        _test_json_schema_parsing_with_string(
            escaped_apostrophe_payload,
            SomeSchema.schema(),
            True,
        )


def test_string_length_limitation():
class SomeSchema(BaseModel):
Expand Down

0 comments on commit 7ed78bb

Please sign in to comment.