From b1b9cabcd51f87afef0ef54c7ecd0e2349d97f83 Mon Sep 17 00:00:00 2001 From: hellovai Date: Wed, 13 Nov 2024 23:35:10 -0600 Subject: [PATCH] support single line quoteless json parsing (#1170) Prior: minified json didn't parse; now we support minified json for numeric, boolean, and null values. We could improve it further for literals. > [!IMPORTANT] > Add support for parsing single-line JSON without quotes for numbers, booleans, and null in `JsonParseState`, with new test cases to validate functionality. > > - **Behavior**: > - Support parsing single-line JSON without quotes for numbers, booleans, and null in `JsonParseState`. > - Handles cases where values are followed by a comma and space, ensuring correct parsing. > - **Tests**: > - Add `test_recursive_union_on_multiple_fields_single_line_without_quotes` and `test_recursive_single_line` in `test_class.rs` to validate new parsing behavior. > - Add `test_recursive_union_on_multiple_fields_single_line_without_quotes_complex` to test complex nested structures without quotes. > > This description was created by [Ellipsis](https://www.ellipsis.dev?ref=BoundaryML%2Fbaml&utm_source=github&utm_medium=referral) for 9aa485c6861054e8463758dcaa02e1b8832843a8. It will automatically update as commits are pushed. --- .../parser/fixing_parser/json_parse_state.rs | 19 ++- .../baml-lib/jsonish/src/tests/test_class.rs | 114 ++++++++++++++++-- 2 files changed, 123 insertions(+), 10 deletions(-) diff --git a/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs b/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs index a1401a3ef..c63dddfce 100644 --- a/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs +++ b/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs @@ -173,6 +173,20 @@ impl JsonParseState { counter = idx; match c { ',' => { + // Check if we have just numeric values in the string so far. 
+ let Some((JsonCollection::UnquotedString(current_value), _)) = + self.collection_stack.last() + else { + return Some(idx); + }; + + // current value could be a numeric-looking thing. + let is_numeric = current_value.trim().parse::<f64>().is_ok(); + let is_bool = current_value.trim().eq_ignore_ascii_case("true") + || current_value.trim().eq_ignore_ascii_case("false"); + let is_null = current_value.trim().eq_ignore_ascii_case("null"); + let is_possible_value = is_numeric || is_bool || is_null; + if let Some((_, next_c)) = next.peek() { match next_c { '\n' => { @@ -181,6 +195,9 @@ impl JsonParseState { } ' ' => { log::debug!("Testing for comment after space + comma"); + if is_possible_value { + return Some(idx); + } // If after the space we have "//" or "/*" or the beginning of a key, we'll close the string let mut buffer = ",".to_string(); let mut anything_but_whitespace = false; @@ -193,7 +210,7 @@ impl JsonParseState { '\n' => { if anything_but_whitespace { } else { - // Likely end of the key as the LLM generated a (', ' token by mistake) + // Likely end of the key as the LLM generated a ", " token by mistake instead of a "," // so drop the comma log::debug!("Closing due to: newline after comma + space"); return Some(idx); diff --git a/engine/baml-lib/jsonish/src/tests/test_class.rs b/engine/baml-lib/jsonish/src/tests/test_class.rs index 83887561b..2b67ec4af 100644 --- a/engine/baml-lib/jsonish/src/tests/test_class.rs +++ b/engine/baml-lib/jsonish/src/tests/test_class.rs @@ -1348,7 +1348,7 @@ test_deserializer!( ); test_deserializer!( - test_same_recursive_union_on_multiple_fields, + test_recursive_union_on_multiple_fields_single_line, r#"class Foo { rec_one Foo | int rec_two Foo | int @@ -1357,19 +1357,49 @@ test_deserializer!( r#" The answer is { + "rec_one": { "rec_one": 1, "rec_two": 2 }, + "rec_two": { + "rec_one": { "rec_one": 1, "rec_two": 2 }, + "rec_two": { "rec_one": 1, "rec_two": 2 } + } + }, + + Anything else I can help with? 
+ "#, + FieldType::Class("Foo".to_string()), + { + "rec_one": { + "rec_one": 1, + "rec_two": 2 + }, + "rec_two": { "rec_one": { "rec_one": 1, "rec_two": 2 }, "rec_two": { - "rec_one": { - "rec_one": 1, - "rec_two": 2 - }, - "rec_two": { - "rec_one": 1, - "rec_two": 2 - } + "rec_one": 1, + "rec_two": 2 + } + }, + } +); + + +test_deserializer!( + test_recursive_union_on_multiple_fields_single_line_without_quotes, + r#"class Foo { + rec_one Foo | int + rec_two Foo | int + } + "#, + r#" + The answer is + { + rec_one: { rec_one: 1, rec_two: 2 }, + rec_two: { + rec_one: { rec_one: 1, rec_two: 2 }, + rec_two: { rec_one: 1, rec_two: 2 } } }, @@ -1393,3 +1423,69 @@ test_deserializer!( }, } ); + + +test_deserializer!( + test_recursive_single_line, + r#"class Foo { + rec_one Foo | int | bool + rec_two Foo | int | bool + } + "#, + r#" + The answer is + { rec_one: true, rec_two: false }, + + Anything else I can help with? + "#, + FieldType::Class("Foo".to_string()), + { + "rec_one": true, + "rec_two": false + } +); + + +test_deserializer!( + test_recursive_union_on_multiple_fields_single_line_without_quotes_complex, + r#"class Foo { + rec_one Foo | int | bool + rec_two Foo | int | bool | null + } + "#, + r#" + The answer is + { + rec_one: { rec_one: { rec_one: true, rec_two: false }, rec_two: null }, + rec_two: { + rec_one: { rec_one: { rec_one: 1, rec_two: 2 }, rec_two: null }, + rec_two: { rec_one: 1, rec_two: null } + } + }, + + Anything else I can help with? + "#, + FieldType::Class("Foo".to_string()), + { + "rec_one": { + "rec_one": { + "rec_one": true, + "rec_two": false + }, + "rec_two": null + }, + "rec_two": { + "rec_one": { + "rec_one": { + "rec_one": 1, + "rec_two": 2 + }, + "rec_two": null + }, + "rec_two": { + "rec_one": 1, + "rec_two": null + } + }, + } +);