Skip to content

Commit

Permalink
feat(jsonish): Add support for curly quotes in JSON parsing
Browse files Browse the repository at this point in the history
LLMs sometimes output JSON with curly quotes (U+201C/U+201D) instead of straight quotes (U+0022). This change adds automatic normalization of these quotes during parsing, making the JSON parser more robust to LLM output variations.

Changes:
- Add normalize_quotes() function to convert curly quotes to straight quotes
- Apply normalization before all parsing attempts (serde, markdown, multi-json)
- Preserve original string for error messages and output

Fixes BoundaryML#1074
  • Loading branch information
afyef committed Dec 16, 2024
1 parent d1952d6 commit 0513ae7
Showing 1 changed file with 23 additions and 5 deletions.
28 changes: 23 additions & 5 deletions engine/baml-lib/jsonish/src/jsonish/parser/entry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,22 @@ use crate::jsonish::{

use super::ParseOptions;

pub fn parse(str: &str, mut options: ParseOptions) -> Result<Value> {

/// Rewrites curly double quotation marks as plain ASCII double quotes.
///
/// Characters that are replaced:
/// - U+201C (left double quotation mark)  → U+0022
/// - U+201D (right double quotation mark) → U+0022
///
/// Every other character is copied through unchanged, and the result is
/// returned as a newly allocated `String`; the input is not modified.
///
/// LLM output sometimes uses these typographic quotes as JSON delimiters,
/// producing text that would be well-formed JSON had straight quotes been used.
fn normalize_quotes(s: &str) -> String {
    s.chars()
        .map(|ch| match ch {
            // Both curly variants collapse to the same straight quote.
            '\u{201C}' | '\u{201D}' => '\u{0022}',
            other => other,
        })
        .collect()
}

pub fn parse<'a>(str: &'a str, mut options: ParseOptions) -> Result<Value> {
log::debug!("Parsing:\n{:?}\n-------\n{}\n-------", options, str);

options.depth += 1;
Expand All @@ -22,15 +37,18 @@ pub fn parse(str: &str, mut options: ParseOptions) -> Result<Value> {
));
}

match serde_json::from_str(str) {
// First normalize any curly quotes
let normalized = normalize_quotes(str);

match serde_json::from_str(&normalized) {
Ok(v) => return Ok(Value::AnyOf(vec![v], str.to_string())),
Err(e) => {
log::debug!("Invalid JSON: {:?}", e);
}
};

if options.allow_markdown_json {
match markdown_parser::parse(str, &options) {
match markdown_parser::parse(&normalized, &options) {
Ok(items) => match items.len() {
0 => {}
1 => {
Expand Down Expand Up @@ -103,7 +121,7 @@ pub fn parse(str: &str, mut options: ParseOptions) -> Result<Value> {
}

if options.all_finding_all_json_objects {
match multi_json_parser::parse(str, &options) {
match multi_json_parser::parse(&normalized, &options) {
Ok(items) => match items.len() {
0 => {}
1 => {
Expand Down Expand Up @@ -136,7 +154,7 @@ pub fn parse(str: &str, mut options: ParseOptions) -> Result<Value> {
}

if options.allow_fixes {
match fixing_parser::parse(str, &options) {
match fixing_parser::parse(&normalized, &options) {
Ok(items) => {
match items.len() {
0 => {}
Expand Down

0 comments on commit 0513ae7

Please sign in to comment.