diff --git a/engine/baml-lib/jsonish/src/jsonish/parser/entry.rs b/engine/baml-lib/jsonish/src/jsonish/parser/entry.rs index 5a138c554..8d7a3f630 100644 --- a/engine/baml-lib/jsonish/src/jsonish/parser/entry.rs +++ b/engine/baml-lib/jsonish/src/jsonish/parser/entry.rs @@ -12,7 +12,22 @@ use crate::jsonish::{ use super::ParseOptions; -pub fn parse(str: &str, mut options: ParseOptions) -> Result { + +/// Normalizes Unicode quotes in a string to standard ASCII double quotes. +/// +/// This function handles the following conversions: +/// - Left double quotation mark (U+201C) → Basic quotation mark (U+0022) +/// - Right double quotation mark (U+201D) → Basic quotation mark (U+0022) +/// +/// This normalization is necessary because LLMs may output JSON with curly quotes +/// that would otherwise be valid JSON if using standard quotes. + +fn normalize_quotes(s: &str) -> String { + // Convert both left (U+201C) and right (U+201D) curly quotes to straight quotes (U+0022) + s.replace('\u{201C}', "\u{0022}").replace('\u{201D}', "\u{0022}") +} + +pub fn parse<'a>(str: &'a str, mut options: ParseOptions) -> Result { log::debug!("Parsing:\n{:?}\n-------\n{}\n-------", options, str); options.depth += 1; @@ -22,7 +37,10 @@ pub fn parse(str: &str, mut options: ParseOptions) -> Result { )); } - match serde_json::from_str(str) { + // First normalize any curly quotes + let normalized = normalize_quotes(str); + + match serde_json::from_str(&normalized) { Ok(v) => return Ok(Value::AnyOf(vec![v], str.to_string())), Err(e) => { log::debug!("Invalid JSON: {:?}", e); @@ -30,7 +48,7 @@ pub fn parse(str: &str, mut options: ParseOptions) -> Result { }; if options.allow_markdown_json { - match markdown_parser::parse(str, &options) { + match markdown_parser::parse(&normalized, &options) { Ok(items) => match items.len() { 0 => {} 1 => { @@ -103,7 +121,7 @@ pub fn parse(str: &str, mut options: ParseOptions) -> Result { } if options.all_finding_all_json_objects { - match 
multi_json_parser::parse(str, &options) { + match multi_json_parser::parse(&normalized, &options) { Ok(items) => match items.len() { 0 => {} 1 => { @@ -136,7 +154,7 @@ pub fn parse(str: &str, mut options: ParseOptions) -> Result { } if options.allow_fixes { - match fixing_parser::parse(str, &options) { + match fixing_parser::parse(&normalized, &options) { Ok(items) => { match items.len() { 0 => {} diff --git a/engine/language_client_python/Cargo.toml b/engine/language_client_python/Cargo.toml index 47f417678..0a5383b58 100644 --- a/engine/language_client_python/Cargo.toml +++ b/engine/language_client_python/Cargo.toml @@ -27,7 +27,9 @@ internal-baml-codegen.workspace = true env_logger.workspace = true futures.workspace = true indexmap.workspace = true +libc = "0.2" log.workspace = true +ctrlc = "3.4" # Consult https://pyo3.rs/main/migration for migration instructions pyo3 = { version = "0.23.3", default-features = false, features = [ "abi3-py38", @@ -44,6 +46,7 @@ regex.workspace = true serde.workspace = true serde_json.workspace = true tokio = { version = "1", features = ["full"] } +tokio-util = { version = "0.7", features = ["full"] } tracing-subscriber = { version = "0.3.18", features = [ "json", "env-filter", diff --git a/engine/language_client_python/src/lib.rs b/engine/language_client_python/src/lib.rs index 73de3ceaf..4764d02b9 100644 --- a/engine/language_client_python/src/lib.rs +++ b/engine/language_client_python/src/lib.rs @@ -7,9 +7,55 @@ use pyo3::prelude::{pyfunction, pymodule, PyAnyMethods, PyModule, PyResult}; use pyo3::types::PyModuleMethods; use pyo3::{wrap_pyfunction, Bound, Python}; use tracing_subscriber::{self, EnvFilter}; +use ctrlc; #[pyfunction] fn invoke_runtime_cli(py: Python) -> PyResult<()> { + // SIGINT (Ctrl+C) Handling Implementation, an approach from @revidious + // + // Background: + // When running BAML through Python, we face a challenge where Python's default SIGINT handling + // can interfere with graceful shutdown. 
This is because: + // 1. Python has its own signal handlers that may conflict with Rust's + // 2. The PyO3 runtime can sometimes mask or delay interrupt signals + // 3. We need to ensure clean shutdown across the Python/Rust boundary + // + // Solution: + // We implement a custom signal handling mechanism using Rust's ctrlc crate that: + // 1. Bypasses Python's signal handling entirely + // 2. Provides consistent behavior across platforms + // 3. Terminates the process with the conventional SIGINT exit code + // Note: While eliminating the root cause of SIGINT handling conflicts would be ideal, + // the source appears to be deeply embedded in BAML's architecture and PyO3's runtime. + // A proper fix would require extensive changes to how BAML handles signals across the + // Python/Rust boundary. For now, this workaround provides reliable interrupt handling + // without requiring major architectural changes, but it remains a stopgap, not a real fix. + + // Create a channel for communicating between the signal handler and the monitor thread + // This is necessary because signal handlers run in a separate context and + // need a safe way to communicate with the rest of the program + let (interrupt_send, interrupt_recv) = std::sync::mpsc::channel(); + + // Install our custom Ctrl+C handler + // This will run in a separate thread when SIGINT is received + ctrlc::set_handler(move || { + println!("\nShutting Down BAML..."); + // Notify the monitor thread through the channel + // Using ok() to ignore send errors if the receiver is already dropped + interrupt_send.send(()).ok(); + }).expect("Error setting Ctrl-C handler"); + + // Monitor for interrupt signals in a separate thread + // This is necessary because we can't directly exit from the signal handler.
+ + std::thread::spawn(move || { + if interrupt_recv.recv().is_ok() { + // Exit with code 130 (128 + SIGINT's signal number 2) + // This is the standard Unix convention for processes terminated by SIGINT + std::process::exit(130); + } + }); + baml_cli::run_cli( py.import("sys")? .getattr("argv")?