diff --git a/engine/baml-runtime/src/constraints.rs b/engine/baml-runtime/src/constraints.rs index 68f770919..8e6e27f07 100644 --- a/engine/baml-runtime/src/constraints.rs +++ b/engine/baml-runtime/src/constraints.rs @@ -34,19 +34,54 @@ pub fn evaluate_test_constraints( /// The result of running a series of block-level constraints within a test. #[derive(Clone, Debug, PartialEq)] pub enum TestConstraintsResult { - /// All checks and asserts passed. - Passed, - - /// An assert failed. - Failed { reason: String }, - - /// At least one check failed to evaluate to true. - Partial { failed_checks: Vec }, + /// Constraint testing finished with the following check + /// results, and optionally a failing assert. + Completed { + checks: Vec<(String, bool)>, + failed_assert: Option, + }, /// There was a problem evaluating a constraint. InternalError { details: String }, } +/// State update helper functions. +impl TestConstraintsResult { + pub fn empty() -> Self { + TestConstraintsResult::Completed { + checks: Vec::new(), + failed_assert: None, + } + } + fn checks(self) -> Vec<(String, bool)> { + match self { + TestConstraintsResult::Completed { checks, .. } => checks, + _ => Vec::new(), + } + } + fn add_check_result(mut self, name: String, result: bool) -> Self { + match self { + TestConstraintsResult::Completed { mut checks, .. } => { + checks.push((name, result)); + TestConstraintsResult::Completed { + checks, + failed_assert: None, + } + } + _ => self, + } + } + fn fail_assert(self, name: Option) -> Self { + match self { + TestConstraintsResult::Completed { checks, .. } => TestConstraintsResult::Completed { + checks, + failed_assert: Some(name.unwrap_or("".to_string())), + }, + _ => self, + } + } +} + /// The state that we track as we iterate over constraints in the test block. 
struct Accumulator { pub result: TestConstraintsResult, @@ -56,7 +91,10 @@ struct Accumulator { impl Accumulator { pub fn new() -> Self { Accumulator { - result: TestConstraintsResult::Passed, + result: TestConstraintsResult::Completed { + checks: Vec::new(), + failed_assert: None, + }, check_results: Vec::new(), } } @@ -74,8 +112,13 @@ fn step_constraints( // Short-circuit if we have already had a hard failure. We can skip // the work in the rest of this function if we have already encountered // a hard failure. - let already_failed = matches!(acc.result, TestConstraintsResult::Failed { .. }) - || matches!(acc.result, TestConstraintsResult::InternalError { .. }); + let already_failed = matches!( + acc.result, + TestConstraintsResult::Completed { + failed_assert: Some(_), + .. + } + ) || matches!(acc.result, TestConstraintsResult::InternalError { .. }); if already_failed { return acc; } @@ -134,21 +177,18 @@ fn step_constraints( bool_result_or_internal_error, ) { // A check ran to completion and succeeded or failed - // (i.e. returned a bool). This updates both the checks context + // (i.e. returned a bool). This updates both the checks jinja context // and the status. (Check, Some(check_name), Ok(check_passed)) => { check_results.push((check_name.clone(), check_passed.into())); - let result = if check_passed { - acc.result - } else { - let mut new_failed_checks = match acc.result { - TestConstraintsResult::Partial { failed_checks } => failed_checks, - _ => Vec::new(), - }; - new_failed_checks.push(check_name); - TestConstraintsResult::Partial { - failed_checks: new_failed_checks, - } + let mut new_checks = match acc.result { + TestConstraintsResult::Completed { checks, .. } => checks, + _ => Vec::new(), + }; + new_checks.push((check_name, check_passed)); + let result = TestConstraintsResult::Completed { + checks: new_checks, + failed_assert: None, }; return Accumulator { result, @@ -181,11 +221,7 @@ fn step_constraints( // A failing assert is a hard error. 
(Assert, maybe_name, Ok(false)) => { - let reason = match maybe_name { - Some(name) => format!("Failed assert {name}."), - None => "Failed assert.".to_string(), - }; - let result = TestConstraintsResult::Failed { reason }; + let result = acc.result.fail_assert(maybe_name); return Accumulator { result, check_results, @@ -334,7 +370,13 @@ mod tests { #[test] fn basic_test_constraints() { let res = run_pipeline(&[mk_assert("has_kids", "_.result.kids|length > 0")]); - assert_eq!(res, TestConstraintsResult::Passed); + assert_eq!( + res, + TestConstraintsResult::Completed { + checks: vec![("has_kids".to_string(), true)], + failed_assert: None, + } + ); } #[test] @@ -344,7 +386,17 @@ mod tests { mk_check("not_too_many", "this.kids.length < 100"), mk_assert("both_pass", "_.checks.has_kids and _.checks.not_too_many"), ]); - assert_eq!(res, TestConstraintsResult::Passed); + assert_eq!( + res, + TestConstraintsResult::Completed { + checks: vec![ + ("has_kids".to_string(), true), + ("not_too_many".to_string(), true), + ("both_pass".to_string(), true), + ], + failed_assert: None + } + ); } #[test] @@ -358,8 +410,13 @@ mod tests { // a check, therefore it doesn't get a field in `checks`. 
assert_eq!( res, - TestConstraintsResult::Failed { - reason: "Failed assert both_pass.".to_string() + TestConstraintsResult::Completed { + checks: vec![ + ("has_kids".to_string(), true), + ("not_too_many".to_string(), true), + ("both_pass".to_string(), true), + ], + failed_assert: Some("both_pass.".to_string()) } ); } @@ -372,7 +429,18 @@ mod tests { mk_check("both_pass", "_.checks.has_kids and _.checks.not_too_many"), mk_assert("either_or", "_.checks.both_pass or _.latency_ms < 1000"), ]); - assert_eq!(res, TestConstraintsResult::Passed); + assert_eq!( + res, + TestConstraintsResult::Completed { + checks: vec![ + ("has_kids".to_string(), true), + ("not_too_many".to_string(), true), + ("both_pass".to_string(), true), + ("either_or".to_string(), true) + ], + failed_assert: None + } + ); } #[test] @@ -386,8 +454,15 @@ mod tests { ]); assert_eq!( res, - TestConstraintsResult::Partial { - failed_checks: vec!["no_kids".to_string(), "way_too_many".to_string()] + TestConstraintsResult::Completed { + checks: vec![ + ("has_kids".to_string(), true), + ("not_too_many".to_string(), true), + ("both_pass".to_string(), true), + ("no_kids".to_string(), false), + ("way_too_many".to_string(), false) + ], + failed_assert: None } ); } @@ -395,6 +470,7 @@ mod tests { #[test] fn test_internal_error() { let res = run_pipeline(&[mk_check("faulty", "__.result.kids|length > 0")]); + // This test fails because there is a typo: `__` (double underscore). assert!(matches!(res, TestConstraintsResult::InternalError { .. 
})); } } diff --git a/engine/baml-runtime/src/lib.rs b/engine/baml-runtime/src/lib.rs index 5d77f54fe..cb67f534b 100644 --- a/engine/baml-runtime/src/lib.rs +++ b/engine/baml-runtime/src/lib.rs @@ -250,15 +250,14 @@ impl BamlRuntime { LLMResponse::Success(complete_llm_response) => Ok(complete_llm_response), _ => Err(anyhow::anyhow!("LLM Response was not successful")), }?; - // web_sys::console::log_1(&format!("constraints: {constraints:?}").into()); let test_constraints_result = if constraints.is_empty() { - TestConstraintsResult::Passed + TestConstraintsResult::empty() } else { match val { Some(Ok(value)) => { evaluate_test_constraints(¶ms, &value, &complete_resp, constraints) } - _ => TestConstraintsResult::Passed, + _ => TestConstraintsResult::empty(), } }; // web_sys::console::log_1(&format!("test_constraints_result: {test_constraints_result:?}").into()); diff --git a/engine/baml-runtime/src/types/response.rs b/engine/baml-runtime/src/types/response.rs index 621f64218..a2fd5f45e 100644 --- a/engine/baml-runtime/src/types/response.rs +++ b/engine/baml-runtime/src/types/response.rs @@ -1,8 +1,8 @@ pub use crate::internal::llm_client::LLMResponse; use crate::{ + constraints::TestConstraintsResult, errors::ExposedError, internal::llm_client::{orchestrator::OrchestrationScope, ResponseBamlValue}, - constraints::TestConstraintsResult, }; use anyhow::Result; use colored::*; @@ -207,7 +207,9 @@ impl From> for BamlValue { fn from(status: TestStatus) -> Self { match status { TestStatus::Pass => BamlValue::String("pass".to_string()), - TestStatus::NeedsHumanEval(checks) => BamlValue::String(format!("checks need human evaluation: {:?}", checks)), + TestStatus::NeedsHumanEval(checks) => { + BamlValue::String(format!("checks need human evaluation: {:?}", checks)) + } TestStatus::Fail(r) => BamlValue::String(format!("failed! 
{:?}", r)), } } @@ -215,10 +217,15 @@ impl From> for BamlValue { #[derive(Debug)] pub enum TestFailReason<'a> { - TestUnspecified(&'a anyhow::Error), + TestUnspecified(anyhow::Error), TestLLMFailure(&'a LLMResponse), TestParseFailure(&'a anyhow::Error), - TestConstraintFailure(anyhow::Error), + TestConstraintsFailure { + checks: Vec<(String, bool)>, + failed_assert: Option, + }, + // TestCheckFailures(Vec), + // TestAssertFailure(String), } impl PartialEq for TestFailReason<'_> { @@ -242,10 +249,24 @@ impl TestResponse { if let Some(parsed) = func_res.result_with_constraints() { if parsed.is_ok() { match self.constraints_result.clone() { - TestConstraintsResult::Passed => TestStatus::Pass, - TestConstraintsResult::Failed { reason } => TestStatus::Fail( TestFailReason::TestConstraintFailure(anyhow::anyhow!(reason))), - TestConstraintsResult::Partial { failed_checks } => TestStatus::NeedsHumanEval( failed_checks ), - TestConstraintsResult::InternalError { details } => TestStatus::Fail( TestFailReason::TestConstraintFailure(anyhow::anyhow!(details))) + TestConstraintsResult::InternalError { details } => { + TestStatus::Fail(TestFailReason::TestUnspecified(anyhow::anyhow!(details))) + } + TestConstraintsResult::Completed { + checks, + failed_assert, + } => { + let n_failed_checks: usize = + checks.iter().filter(|(_, pass)| !pass).count(); + if failed_assert.is_some() || n_failed_checks > 0 { + TestStatus::Fail(TestFailReason::TestConstraintsFailure { + checks, + failed_assert, + }) + } else { + TestStatus::Pass + } + } } } else { TestStatus::Fail(TestFailReason::TestParseFailure( diff --git a/engine/baml-schema-wasm/src/runtime_wasm/mod.rs b/engine/baml-schema-wasm/src/runtime_wasm/mod.rs index 5836de97f..8cb970321 100644 --- a/engine/baml-schema-wasm/src/runtime_wasm/mod.rs +++ b/engine/baml-schema-wasm/src/runtime_wasm/mod.rs @@ -12,8 +12,7 @@ use baml_runtime::RenderCurlSettings; use baml_runtime::{ internal::llm_client::LLMResponse, BamlRuntime, DiagnosticsError, 
IRHelper, RenderedPrompt, }; -use baml_types::ResponseCheck; -use baml_types::{BamlMediaType, BamlValue, BamlValueWithMeta, GeneratorOutputType, TypeValue}; +use baml_types::{BamlMediaType, BamlValue, GeneratorOutputType, TypeValue}; use indexmap::IndexMap; use internal_baml_codegen::version_check::GeneratorType; use internal_baml_codegen::version_check::{check_version, VersionCheckMode}; @@ -597,7 +596,7 @@ impl WasmTestResponse { baml_runtime::TestFailReason::TestUnspecified(_) => TestStatus::UnableToRun, baml_runtime::TestFailReason::TestLLMFailure(_) => TestStatus::LLMFailure, baml_runtime::TestFailReason::TestParseFailure(_) => TestStatus::ParseFailure, - baml_runtime::TestFailReason::TestConstraintFailure(_) => { + baml_runtime::TestFailReason::TestConstraintsFailure { .. } => { TestStatus::ConstraintsFailed } }, @@ -779,7 +778,23 @@ impl WithRenderError for baml_runtime::TestFailReason<'_> { baml_runtime::TestFailReason::TestUnspecified(e) => Some(format!("{e:#}")), baml_runtime::TestFailReason::TestLLMFailure(f) => f.render_error(), baml_runtime::TestFailReason::TestParseFailure(e) => Some(format!("{e:#}")), - baml_runtime::TestFailReason::TestConstraintFailure(e) => Some(format!("{e:#}")), + baml_runtime::TestFailReason::TestConstraintsFailure { + checks, + failed_assert, + } => { + let checks_msg = if checks.len() > 0 { + let check_msgs = checks.into_iter().map(|(name, pass)| { + format!("{name}: {}", if *pass { "Passed" } else { "Failed" }) + }); + format!("Check results:\n{}", join(check_msgs, "\n")) + } else { + String::new() + }; + let assert_msg = failed_assert + .as_ref() + .map_or("".to_string(), |name| format!("\nFailed assert: {name}")); + Some(format!("{checks_msg}{assert_msg}")) + } } } } diff --git a/typescript/playground-common/src/baml_wasm_web/test_uis/testHooks.ts b/typescript/playground-common/src/baml_wasm_web/test_uis/testHooks.ts index ce52bab6d..7c6c34b89 100644 --- 
a/typescript/playground-common/src/baml_wasm_web/test_uis/testHooks.ts +++ b/typescript/playground-common/src/baml_wasm_web/test_uis/testHooks.ts @@ -50,6 +50,15 @@ export const statusCountAtom = atom({ error: 0, }) +/// This atom will track the state of the full test suite. +/// `unknown` means tests haven't been run yet. `pass` means +/// they have all run to completion. +/// `warn` means at least one check has failed, and `fail` +/// means at least one assert has failed, or an internal error +/// occurred. +export type TestSuiteSummary = 'pass' | 'warn' | 'fail' | 'unknown' +export const testSuiteSummary = atom('unknown') + export const useRunHooks = () => { const isRunning = useAtomValue(isRunningAtom) @@ -69,6 +78,7 @@ export const useRunHooks = () => { } set(isRunningAtom, true) set(showTestsAtom, true) + set(testSuiteSummary,'unknown') vscode.postMessage({ command: 'telemetry', @@ -146,7 +156,7 @@ export const useRunHooks = () => { const { res, elapsed } = result.value // console.log('result', i, result.value.res.llm_response(), 'batch[i]', batch[i]) - let status = res.status() + let status: Number = res.status() let response_status: DoneTestStatusType = 'error' if (status === 0) { response_status = 'passed'