From cafd2ea35ac0d3129ddddb7c4fc81561a7316657 Mon Sep 17 00:00:00 2001
From: Greg Hale <ImAlsoGreg@gmail.com>
Date: Fri, 22 Nov 2024 16:38:49 -0800
Subject: [PATCH] Add constraints to test blocks. (#1185)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR extends the syntax for test blocks with constraints (checks and
asserts):

```baml
test SomeTest {
  functions [Succ]
  args {
     x 1
  }
  @@assert( {{ this == 2 }} )
}
```

Testing done:
 - [x] Unit tests for parsing and interpreting test-level constraints.
 - [x] Integ tests still pass, but none were added for this feature.
 - [x] Manual testing of checks & asserts in vscode extension.

## Details

`test` blocks in BAML code may now contain checks and asserts. A
slightly different set of variables is available in the context of the
jinja expressions a user can write in a test, compared to the
constraints a user would place on types:

 - The `_` variable contains fields `result`, `checks` and `latency_ms`.
 - The `this` variable refers to the value computed by the test.
 - In a given constraint, `_.checks.NAME` can refer to the NAME of any
   earlier check that was run in the same test block.

The UI has been updated to reflect the results of test-level
constraints. Failing asserts result in a test error, and failing
checks result in a message indicating that user intervention is
required to assess the response.

## Example screenshots

<img width="902" alt="Screenshot 2024-11-20 at 3 23 46 PM"
src="https://github.com/user-attachments/assets/952751bc-1b7f-4978-ad06-0639c4269ba0">

This example shows three checks that use different builtin variables in
their predicate functions, and an assert that refers to the previous
checks.

---

<img width="932" alt="Screenshot 2024-11-20 at 3 29 32 PM"
src="https://github.com/user-attachments/assets/beb5d296-fe10-4227-a0c0-20f3f5ff6f92">

This example shows how a failing assert is rendered. The failing assert
is the result of asserting the status of a prior check, named `fast`,
which failed.

---


<img width="534" alt="Screenshot 2024-11-20 at 3 31 07 PM"
src="https://github.com/user-attachments/assets/364d79dc-78f7-4f6b-8cc2-741b14b2b659">

Jinja expressions that try to reference nonexistent checks, or checks
that are defined later in the test, raise compiler warnings.

---
<img width="407" alt="Screenshot 2024-11-20 at 3 46 02 PM"
src="https://github.com/user-attachments/assets/5b06e88c-8348-4e17-8ca1-4c41ba587f5f">

Function arguments are available in jinja expressions, and are visible
to the static analyzer, so that warnings can be raised when attempting
to use a nonexistent argument.

---
<img width="909" alt="Screenshot 2024-11-22 at 4 17 36 PM"
src="https://github.com/user-attachments/assets/d110d6a6-0b75-4663-a744-d8cd82de59ad">

This screenshot shows the provisional UI for when some subset of the
constraints fails.
- When at least one constraint fails, all checks and their status are
  rendered.
- Any time tests are run, the status of the whole test suite is
  indicated with an icon (in this case, the yellow warning sign).

<!-- ELLIPSIS_HIDDEN -->


----

> [!IMPORTANT]
> This PR adds constraints to BAML test blocks, updates the runtime for
constraint evaluation, and enhances the UI to display results.
>
>   - **Behavior**:
>     - Adds constraints (`check` and `assert`) to BAML `test` blocks.
>     - Constraints use variables like `this`, `_.result`, and `_.checks`.
>     - Failing asserts cause test errors; failing checks require user
>       intervention.
>   - **Validation**:
>     - Adds validation for constraints in `tests.rs`.
>     - Ensures constraints are correctly parsed and validated in
>       `constraint.rs`.
>   - **Runtime**:
>     - Implements `evaluate_test_constraints` in `constraints.rs` to
>       process constraints.
>     - Updates `orchestrate` functions to handle constraint evaluation.
>   - **UI**:
>     - Updates test status handling in `testHooks.ts` and `test_result.tsx`
>       to include `constraints_failed` status.
>     - Adds UI elements to display constraint evaluation results.
>   - **Misc**:
>     - Updates `Cargo.toml` files to include necessary dependencies.
>     - Adds tests for constraint evaluation in `constraints.rs`.
>
> <sup>This description was created by </sup>[<img alt="Ellipsis"
src="https://img.shields.io/badge/Ellipsis-blue?color=175173">](https://www.ellipsis.dev?ref=BoundaryML%2Fbaml&utm_source=github&utm_medium=referral)<sup>
for 49e312dc4f298175cb746afcaf1e35f9b72d2dd8. It will automatically
update as commits are pushed.</sup>


<!-- ELLIPSIS_HIDDEN -->
---
 engine/Cargo.lock                             |   2 +
 .../baml-core/src/ir/jinja_helpers.rs         |  24 +-
 engine/baml-lib/baml-core/src/ir/repr.rs      |  89 +++-
 engine/baml-lib/baml-core/src/ir/walker.rs    |   4 +-
 .../validation_pipeline/validations.rs        |   2 +
 .../validation_pipeline/validations/tests.rs  |  94 ++++
 engine/baml-lib/baml-types/src/constraint.rs  |   2 +-
 .../src/attributes/constraint.rs              |  63 ++-
 .../parser-database/src/attributes/mod.rs     |   2 +-
 engine/baml-lib/parser-database/src/tarjan.rs |   3 +
 .../src/types/configurations.rs               |  23 +-
 .../baml-lib/parser-database/src/types/mod.rs |   2 +
 .../src/ast/value_expression_block.rs         |   2 +-
 .../schema-ast/src/parser/datamodel.pest      |   2 +-
 .../parser/parse_value_expression_block.rs    |  33 +-
 engine/baml-runtime/Cargo.toml                |   1 +
 engine/baml-runtime/src/constraints.rs        | 469 ++++++++++++++++++
 .../src/internal/llm_client/mod.rs            |   6 +-
 .../internal/llm_client/orchestrator/call.rs  |   2 +-
 .../llm_client/orchestrator/stream.rs         |   4 +-
 engine/baml-runtime/src/lib.rs                |  93 ++--
 .../src/runtime/runtime_interface.rs          |  10 +-
 engine/baml-runtime/src/runtime_interface.rs  |   9 +-
 engine/baml-runtime/src/types/response.rs     |  34 +-
 engine/baml-schema-wasm/Cargo.toml            |   1 +
 .../baml-schema-wasm/src/runtime_wasm/mod.rs  |  55 +-
 .../src/baml_wasm_web/test_uis/testHooks.ts   |  59 ++-
 .../baml_wasm_web/test_uis/test_result.tsx    |  40 +-
 28 files changed, 1035 insertions(+), 95 deletions(-)
 create mode 100644 engine/baml-lib/baml-core/src/validate/validation_pipeline/validations/tests.rs
 create mode 100644 engine/baml-runtime/src/constraints.rs
diff --git a/engine/Cargo.lock b/engine/Cargo.lock
index 8414efd68..02a170e0f 100644
--- a/engine/Cargo.lock
+++ b/engine/Cargo.lock
@@ -955,6 +955,7 @@ dependencies = [
  "log",
  "mime",
  "mime_guess",
+ "minijinja",
  "notify-debouncer-full",
  "pin-project-lite",
  "pretty_assertions",
@@ -1010,6 +1011,7 @@ dependencies = [
  "indoc",
  "internal-baml-codegen",
  "internal-baml-core",
+ "itertools 0.13.0",
  "js-sys",
  "jsonish",
  "log",
diff --git a/engine/baml-lib/baml-core/src/ir/jinja_helpers.rs b/engine/baml-lib/baml-core/src/ir/jinja_helpers.rs
index b5d3a636c..56a46b7d1 100644
--- a/engine/baml-lib/baml-core/src/ir/jinja_helpers.rs
+++ b/engine/baml-lib/baml-core/src/ir/jinja_helpers.rs
@@ -50,7 +50,7 @@ fn sum_filter(value: Vec<Value>) -> Value {
 /// E.g. `"a|length > 2"` with context `{"a": [1, 2, 3]}` will return `"true"`.
 pub fn render_expression(
     expression: &JinjaExpression,
-    ctx: &HashMap<String, BamlValue>,
+    ctx: &HashMap<String, minijinja::Value>,
 ) -> anyhow::Result<String> {
     let env = get_env();
     // In rust string literals, `{` is escaped as `{{`.
@@ -66,8 +66,8 @@ pub fn evaluate_predicate(
     this: &BamlValue,
     predicate_expression: &JinjaExpression,
 ) -> Result<bool, anyhow::Error> {
-    let ctx: HashMap<String, BamlValue> =
-        [("this".to_string(), this.clone())].into_iter().collect();
+    let ctx: HashMap<String, minijinja::Value> =
+        HashMap::from([("this".to_string(), minijinja::Value::from_serialize(this))]);
     match render_expression(&predicate_expression, &ctx)?.as_ref() {
         "true" => Ok(true),
         "false" => Ok(false),
@@ -87,11 +87,12 @@ mod tests {
                 "a".to_string(),
                 BamlValue::List(
                     vec![BamlValue::Int(1), BamlValue::Int(2), BamlValue::Int(3)].into(),
-                ),
+                )
+                .into(),
             ),
             (
                 "b".to_string(),
-                BamlValue::String("(123)456-7890".to_string()),
+                BamlValue::String("(123)456-7890".to_string()).into(),
             ),
         ]
         .into_iter()
@@ -118,11 +119,12 @@ mod tests {
                 "a".to_string(),
                 BamlValue::List(
                     vec![BamlValue::Int(1), BamlValue::Int(2), BamlValue::Int(3)].into(),
-                ),
+                )
+                .into(),
             ),
             (
                 "b".to_string(),
-                BamlValue::String("(123)456-7890".to_string()),
+                BamlValue::String("(123)456-7890".to_string()).into(),
             ),
         ]
         .into_iter()
@@ -151,16 +153,12 @@ mod tests {
     fn test_sum_filter() {
         let ctx = vec![].into_iter().collect();
         assert_eq!(
-            render_expression(&JinjaExpression(
-                r#"[1,2]|sum"#.to_string()
-            ), &ctx).unwrap(),
+            render_expression(&JinjaExpression(r#"[1,2]|sum"#.to_string()), &ctx).unwrap(),
             "3"
         );
 
         assert_eq!(
-            render_expression(&JinjaExpression(
-                r#"[1,2.5]|sum"#.to_string()
-            ), &ctx).unwrap(),
+            render_expression(&JinjaExpression(r#"[1,2.5]|sum"#.to_string()), &ctx).unwrap(),
             "3.5"
         );
     }
diff --git a/engine/baml-lib/baml-core/src/ir/repr.rs b/engine/baml-lib/baml-core/src/ir/repr.rs
index 3d2db25ed..fb6d71fa1 100644
--- a/engine/baml-lib/baml-core/src/ir/repr.rs
+++ b/engine/baml-lib/baml-core/src/ir/repr.rs
@@ -8,10 +8,11 @@ use internal_baml_parser_database::{
     walkers::{
         ClassWalker, ClientSpec as AstClientSpec, ClientWalker, ConfigurationWalker,
         EnumValueWalker, EnumWalker, FieldWalker, FunctionWalker, TemplateStringWalker,
+        Walker as AstWalker,
     },
     Attributes, ParserDatabase, PromptAst, RetryPolicyStrategy,
 };
-use internal_baml_schema_ast::ast::SubType;
+use internal_baml_schema_ast::ast::{SubType, ValExpId};
 
 use baml_types::JinjaExpression;
 use internal_baml_schema_ast::ast::{self, FieldArity, WithName, WithSpan};
@@ -676,8 +677,14 @@ impl WithRepr<Enum> for EnumWalker<'_> {
     fn repr(&self, db: &ParserDatabase) -> Result<Enum> {
         Ok(Enum {
             name: self.name().to_string(),
-            values: self.values().map(|w| (w.node(db).map(|v| (v, w.documentation().map(|s| Docstring(s.to_string())))))).collect::<Result<Vec<_>,_>>()?,
-            docstring: self.get_documentation().map(|s| Docstring(s))
+            values: self
+                .values()
+                .map(|w| {
+                    w.node(db)
+                        .map(|v| (v, w.documentation().map(|s| Docstring(s.to_string()))))
+                })
+                .collect::<Result<Vec<_>, _>>()?,
+            docstring: self.get_documentation().map(|s| Docstring(s)),
         })
     }
 }
@@ -722,7 +729,6 @@ impl WithRepr<Field> for FieldWalker<'_> {
             docstring: self.get_documentation().map(|s| Docstring(s)),
         })
     }
-
 }
 
 type ClassId = String;
@@ -774,7 +780,7 @@ impl WithRepr<Class> for ClassWalker<'_> {
                     .collect::<Result<Vec<_>>>()?,
                 None => Vec::new(),
             },
-            docstring: self.get_documentation().map(|s| Docstring(s))
+            docstring: self.get_documentation().map(|s| Docstring(s)),
         })
     }
 }
@@ -1110,14 +1116,23 @@ pub struct TestCase {
     pub name: String,
     pub functions: Vec<Node<TestCaseFunction>>,
     pub args: IndexMap<String, Expression>,
+    pub constraints: Vec<Constraint>,
 }
 
 impl WithRepr<TestCaseFunction> for (&ConfigurationWalker<'_>, usize) {
     fn attributes(&self, _db: &ParserDatabase) -> NodeAttributes {
         let span = self.0.test_case().functions[self.1].1.clone();
+        let constraints = self
+            .0
+            .test_case()
+            .constraints
+            .iter()
+            .map(|(c, _, _)| c)
+            .cloned()
+            .collect();
         NodeAttributes {
             meta: IndexMap::new(),
-            constraints: Vec::new(),
+            constraints,
             span: Some(span),
         }
     }
@@ -1131,10 +1146,17 @@ impl WithRepr<TestCaseFunction> for (&ConfigurationWalker<'_>, usize) {
 
 impl WithRepr<TestCase> for ConfigurationWalker<'_> {
     fn attributes(&self, _db: &ParserDatabase) -> NodeAttributes {
+        let constraints = self
+            .test_case()
+            .constraints
+            .iter()
+            .map(|(c, _, _)| c)
+            .cloned()
+            .collect();
         NodeAttributes {
             meta: IndexMap::new(),
             span: Some(self.span().clone()),
-            constraints: Vec::new(),
+            constraints,
         }
     }
 
@@ -1151,6 +1173,12 @@ impl WithRepr<TestCase> for ConfigurationWalker<'_> {
                 .map(|(k, (_, v))| Ok((k.clone(), v.repr(db)?)))
                 .collect::<Result<IndexMap<_, _>>>()?,
             functions,
+            constraints: <AstWalker<'_, (ValExpId, &str)> as WithRepr<TestCase>>::attributes(
+                self, db,
+            )
+            .constraints
+            .into_iter()
+            .collect::<Vec<_>>(),
         })
     }
 }
@@ -1223,7 +1251,8 @@ mod tests {
 
     #[test]
     fn test_docstrings() {
-        let ir = make_test_ir(r#"
+        let ir = make_test_ir(
+            r#"
           /// Foo class.
           class Foo {
             /// Bar field.
@@ -1243,7 +1272,9 @@ mod tests {
 
             THIRD
           }
-        "#).unwrap();
+        "#,
+        )
+        .unwrap();
 
         // Test class docstrings
         let foo = ir.find_class("Foo").as_ref().unwrap().clone().elem();
@@ -1252,7 +1283,7 @@ mod tests {
             [field1, field2] => {
                 assert_eq!(field1.elem.docstring.as_ref().unwrap().0, "Bar field.");
                 assert_eq!(field2.elem.docstring.as_ref().unwrap().0, "Baz field.");
-            },
+            }
             _ => {
                 panic!("Expected 2 fields");
             }
@@ -1260,7 +1291,10 @@ mod tests {
 
         // Test enum docstrings
         let test_enum = ir.find_enum("TestEnum").as_ref().unwrap().clone().elem();
-        assert_eq!(test_enum.docstring.as_ref().unwrap().0.as_str(), "Test enum.");
+        assert_eq!(
+            test_enum.docstring.as_ref().unwrap().0.as_str(),
+            "Test enum."
+        );
         match test_enum.values.as_slice() {
             [val1, val2, val3] => {
                 assert_eq!(val1.0.elem.0, "FIRST");
@@ -1269,10 +1303,41 @@ mod tests {
                 assert_eq!(val2.1.as_ref().unwrap().0, "Second variant.");
                 assert_eq!(val3.0.elem.0, "THIRD");
                 assert!(val3.1.is_none());
-            },
+            }
             _ => {
                 panic!("Expected 3 enum values");
             }
         }
     }
+
+    #[test]
+    fn test_block_attributes() { // A block-level `@@assert` in a test should appear in the IR test case's constraints.
+        let ir = make_test_ir(
+            r##"
+            client<llm> GPT4 {
+              provider openai
+              options {
+                model gpt-4o
+                api_key env.OPENAI_API_KEY
+              }
+            }
+            function Foo(a: int) -> int {
+              client GPT4
+              prompt #"Double the number {{ a }}"#
+            }
+
+            test Foo() {
+              functions [Foo]
+              args {
+                a 10
+              }
+              @@assert( {{ result == 20 }} )
+            }
+        "##,
+        )
+        .unwrap();
+        let function = ir.find_function("Foo").unwrap();
+        let walker = ir.find_test(&function, "Foo").unwrap(); // the test block above is also named `Foo`
+        assert_eq!(walker.item.1.elem.constraints.len(), 1); // exactly the single `@@assert` declared above
+    }
 }
diff --git a/engine/baml-lib/baml-core/src/ir/walker.rs b/engine/baml-lib/baml-core/src/ir/walker.rs
index 034bf2e3a..86e09bc61 100644
--- a/engine/baml-lib/baml-core/src/ir/walker.rs
+++ b/engine/baml-lib/baml-core/src/ir/walker.rs
@@ -260,9 +260,9 @@ impl Expression {
             }
             Expression::JinjaExpression(expr) => {
                 // TODO: do not coerce all context values to strings.
-                let jinja_context: HashMap<String, BamlValue> = env_values
+                let jinja_context: HashMap<String, minijinja::Value> = env_values
                     .iter()
-                    .map(|(k, v)| (k.clone(), BamlValue::String(v.clone())))
+                    .map(|(k, v)| (k.clone(), v.clone().into()))
                     .collect();
                 let res_string = render_expression(&expr, &jinja_context)?;
                 Ok(BamlValue::String(res_string))
diff --git a/engine/baml-lib/baml-core/src/validate/validation_pipeline/validations.rs b/engine/baml-lib/baml-core/src/validate/validation_pipeline/validations.rs
index c5c6acb3f..4dfc72aef 100644
--- a/engine/baml-lib/baml-core/src/validate/validation_pipeline/validations.rs
+++ b/engine/baml-lib/baml-core/src/validate/validation_pipeline/validations.rs
@@ -5,6 +5,7 @@ mod cycle;
 mod enums;
 mod functions;
 mod template_strings;
+mod tests;
 mod types;
 
 use baml_types::GeneratorOutputType;
@@ -22,6 +23,7 @@ pub(super) fn validate(ctx: &mut Context<'_>) {
     clients::validate(ctx);
     template_strings::validate(ctx);
     configurations::validate(ctx);
+    tests::validate(ctx);
 
     let generators = load_generators_from_ast(ctx.db.ast(), ctx.diagnostics);
     let codegen_targets: HashSet<GeneratorOutputType> = generators.into_iter().filter_map(|generator| match generator {
diff --git a/engine/baml-lib/baml-core/src/validate/validation_pipeline/validations/tests.rs b/engine/baml-lib/baml-core/src/validate/validation_pipeline/validations/tests.rs
new file mode 100644
index 000000000..288f43992
--- /dev/null
+++ b/engine/baml-lib/baml-core/src/validate/validation_pipeline/validations/tests.rs
@@ -0,0 +1,94 @@
+use baml_types::{Constraint, ConstraintLevel};
+use internal_baml_diagnostics::{DatamodelError, DatamodelWarning, Span};
+use internal_baml_jinja_types::{validate_expression, JinjaContext, PredefinedTypes, Type};
+
+use crate::validate::validation_pipeline::context::Context;
+
+pub(super) fn validate(ctx: &mut Context<'_>) { // Validates the jinja expression of every constraint attached to a test block.
+    let tests = ctx.db.walk_test_cases().collect::<Vec<_>>();
+    tests.iter().for_each(|walker| {
+        let constraints = &walker.test_case().constraints;
+        let args = &walker.test_case().args;
+        let mut check_names: Vec<String> = Vec::new(); // labels of the checks seen so far in this test block
+        for (
+            Constraint {
+                label,
+                level,
+                expression,
+            },
+            constraint_span, // NOTE(review): bound here but never read in this loop body
+            expr_span,
+        ) in constraints.iter()
+        {
+            let mut defined_types = PredefinedTypes::default(JinjaContext::Parsing); // rebuilt for each constraint
+            defined_types.add_variable("this", Type::Unknown);
+            defined_types.add_class(
+                "Checks", // one field per previously recorded check name
+                check_names
+                    .iter()
+                    .map(|check_name| (check_name.clone(), Type::Unknown))
+                    .collect(),
+            );
+            defined_types.add_class(
+                "_", // class backing the `_` variable: `checks`, `result` and `latency_ms`
+                vec![
+                    ("checks".to_string(), Type::ClassRef("Checks".to_string())),
+                    ("result".to_string(), Type::Unknown),
+                    ("latency_ms".to_string(), Type::Number),
+                ]
+                .into_iter()
+                .collect(),
+            );
+            defined_types.add_variable("_", Type::ClassRef("_".to_string()));
+            args.keys() // every test argument name is exposed as a variable of unknown type
+                .for_each(|arg_name| defined_types.add_variable(arg_name, Type::Unknown));
+            match (level, label) {
+                (ConstraintLevel::Check, Some(check_name)) => {
+                    check_names.push(check_name.to_string()); // pushed after `Checks` was built, so a check cannot see its own name
+                }
+                _ => {} // asserts and unlabeled checks add no name
+            }
+            match validate_expression(expression.0.as_str(), &mut defined_types) {
+                Ok(_) => {}
+                Err(e) => {
+                    if let Some(e) = e.parsing_errors { // parse failures are pushed as errors
+                        let range = match e.range() {
+                            Some(range) => range,
+                            None => { // no range reported: attach the error to the whole expression span
+                                ctx.push_error(DatamodelError::new_validation_error(
+                                    &format!("Error parsing jinja template: {}", e),
+                                    expr_span.clone(),
+                                ));
+                                continue;
+                            }
+                        };
+
+                        let start_offset = expr_span.start + range.start; // shift the reported range by the expression's start
+                        let end_offset = expr_span.start + range.end;
+
+                        let span = Span::new(
+                            expr_span.file.clone(),
+                            start_offset as usize,
+                            end_offset as usize,
+                        );
+
+                        ctx.push_error(DatamodelError::new_validation_error(
+                            &format!("Error parsing jinja template: {}", e),
+                            span,
+                        ))
+                    } else { // everything in `e.errors` is pushed as a warning, not an error
+                        e.errors.iter().for_each(|t| {
+                            let tspan = t.span();
+                            let span = Span::new(
+                                expr_span.file.clone(),
+                                expr_span.start + tspan.start_offset as usize,
+                                expr_span.start + tspan.end_offset as usize,
+                            );
+                            ctx.push_warning(DatamodelWarning::new(t.message().to_string(), span))
+                        })
+                    }
+                }
+            }
+        }
+    });
+}
diff --git a/engine/baml-lib/baml-types/src/constraint.rs b/engine/baml-lib/baml-types/src/constraint.rs
index 16abad0a3..372125d7f 100644
--- a/engine/baml-lib/baml-types/src/constraint.rs
+++ b/engine/baml-lib/baml-types/src/constraint.rs
@@ -27,7 +27,7 @@ pub enum ConstraintLevel {
 }
 
 /// The user-visible schema for a failed check.
-#[derive(Clone, Debug, serde::Serialize)]
+#[derive(Clone, Debug, serde::Serialize, PartialEq, Eq)]
 pub struct ResponseCheck {
     pub name: String,
     pub expression: String,
diff --git a/engine/baml-lib/parser-database/src/attributes/constraint.rs b/engine/baml-lib/parser-database/src/attributes/constraint.rs
index c54a921d5..75ca839bd 100644
--- a/engine/baml-lib/parser-database/src/attributes/constraint.rs
+++ b/engine/baml-lib/parser-database/src/attributes/constraint.rs
@@ -1,9 +1,70 @@
 use baml_types::{Constraint, ConstraintLevel};
 use internal_baml_diagnostics::{DatamodelError, Span};
-use internal_baml_schema_ast::ast::{Attribute, Expression};
+use internal_baml_schema_ast::ast::{Argument, Attribute, Expression};
 
 use crate::{context::Context, types::Attributes};
 
+/// Interpret an attribute as a constraint. Returns the constraint with the span of the whole
+/// attribute and the span of its jinja expression (when one is produced), plus any errors found.
+pub fn attribute_as_constraint(
+    attribute: &Attribute,
+) -> (Option<(Constraint, Span, Span)>, Vec<DatamodelError>) {
+    let span = attribute.span.clone();
+    let mut datamodel_errors = Vec::new();
+    let attribute_name = attribute.name.to_string();
+    let arguments: Vec<Expression> = attribute
+        .arguments
+        .arguments
+        .iter()
+        .map(|Argument { value, .. }| value) // only the argument values are kept
+        .cloned()
+        .collect();
+
+    let level = match attribute_name.as_str() {
+        "assert" => ConstraintLevel::Assert,
+        "check" => ConstraintLevel::Check,
+        _ => { // any other attribute name: no constraint, and no error is reported here
+            return (None, datamodel_errors);
+        }
+    };
+
+    let (label, expression, expr_span) = match arguments.as_slice() {
+        [Expression::JinjaExpressionValue(expression, expr_span)] => { // lone expression, no label
+            if level == ConstraintLevel::Check { // an unlabeled check is reported as an error...
+                datamodel_errors.push(DatamodelError::new_attribute_validation_error(
+                    "Checks must specify a label.",
+                    attribute_name.as_str(),
+                    span.clone(),
+                ));
+            }
+            (None, expression.clone(), expr_span.clone()) // ...but the unlabeled constraint is still returned
+        }
+        [Expression::Identifier(label), Expression::JinjaExpressionValue(expression, expr_span)] => {
+            (
+                Some(label.to_string()),
+                expression.clone(),
+                expr_span.clone(),
+            )
+        }
+        _ => { // any other argument shape is rejected with an error and no constraint
+            datamodel_errors.push(
+                DatamodelError::new_attribute_validation_error(
+                    "Checks and asserts may have either a label and an expression, or a lone expression.",
+                    attribute_name.as_str(),
+                    span
+                )
+            );
+            return (None, datamodel_errors);
+        }
+    };
+    let constraint = Constraint {
+        label,
+        expression,
+        level,
+    };
+    (Some((constraint, span, expr_span)), datamodel_errors)
+}
+
 pub(super) fn visit_constraint_attributes(
     attribute_name: String,
     span: Span,
diff --git a/engine/baml-lib/parser-database/src/attributes/mod.rs b/engine/baml-lib/parser-database/src/attributes/mod.rs
index 95daebcba..7d27531fc 100644
--- a/engine/baml-lib/parser-database/src/attributes/mod.rs
+++ b/engine/baml-lib/parser-database/src/attributes/mod.rs
@@ -1,7 +1,7 @@
 use internal_baml_schema_ast::ast::{Top, TopId, TypeExpId, TypeExpressionBlock};
 
 mod alias;
-mod constraint;
+pub mod constraint;
 mod description;
 mod to_string_attribute;
 use crate::interner::StringId;
diff --git a/engine/baml-lib/parser-database/src/tarjan.rs b/engine/baml-lib/parser-database/src/tarjan.rs
index e3559f39e..935969e95 100644
--- a/engine/baml-lib/parser-database/src/tarjan.rs
+++ b/engine/baml-lib/parser-database/src/tarjan.rs
@@ -1,4 +1,7 @@
 //! Tarjan's strongly connected components algorithm for cycle detection.
+//!
+//! This is used in parser_database to detect cycles in BAML types
+//! that reference each other recursively.
 
 use std::{
     cmp,
diff --git a/engine/baml-lib/parser-database/src/types/configurations.rs b/engine/baml-lib/parser-database/src/types/configurations.rs
index 95f88c71c..6b09b7b4e 100644
--- a/engine/baml-lib/parser-database/src/types/configurations.rs
+++ b/engine/baml-lib/parser-database/src/types/configurations.rs
@@ -1,11 +1,17 @@
+use baml_types::Constraint;
 use internal_baml_diagnostics::{DatamodelError, DatamodelWarning, Span};
-use internal_baml_schema_ast::ast::{ValExpId, ValueExprBlock, WithIdentifier, WithName, WithSpan};
+use internal_baml_schema_ast::ast::{
+    Attribute, ValExpId, ValueExprBlock, WithIdentifier, WithName, WithSpan,
+};
 use regex::Regex;
 use std::collections::HashSet;
 
+use crate::attributes::constraint::attribute_as_constraint;
 use crate::{coerce, coerce_array, coerce_expression::coerce_map, context::Context};
 
-use super::{ContantDelayStrategy, ExponentialBackoffStrategy, RetryPolicy, RetryPolicyStrategy};
+use super::{
+    Attributes, ContantDelayStrategy, ExponentialBackoffStrategy, RetryPolicy, RetryPolicyStrategy,
+};
 
 fn dedent(s: &str) -> String {
     // Find the shortest indentation in the string (that's not an empty line).
@@ -288,6 +294,18 @@ pub(crate) fn visit_test_case<'db>(
             )),
         });
 
+    let constraints: Vec<(Constraint, Span, Span)> = config
+        .attributes
+        .iter()
+        .filter_map(|attribute| {
+            let (maybe_constraint, errors) = attribute_as_constraint(attribute);
+            for error in errors {
+                ctx.push_error(error);
+            }
+            maybe_constraint
+        })
+        .collect();
+
     match (functions, args) {
         (None, _) => ctx.push_error(DatamodelError::new_validation_error(
             "Missing `functions` property",
@@ -304,6 +322,7 @@ pub(crate) fn visit_test_case<'db>(
                     functions,
                     args,
                     args_field_span: args_field_span.clone(),
+                    constraints,
                 },
             );
         }
diff --git a/engine/baml-lib/parser-database/src/types/mod.rs b/engine/baml-lib/parser-database/src/types/mod.rs
index ddb13b4ea..9cfb09664 100644
--- a/engine/baml-lib/parser-database/src/types/mod.rs
+++ b/engine/baml-lib/parser-database/src/types/mod.rs
@@ -5,6 +5,7 @@ use crate::coerce;
 use crate::types::configurations::visit_test_case;
 use crate::{context::Context, DatamodelError};
 
+use baml_types::Constraint;
 use indexmap::IndexMap;
 use internal_baml_diagnostics::Span;
 use internal_baml_prompt_parser::ast::{ChatBlock, PrinterBlock, Variable};
@@ -138,6 +139,7 @@ pub struct TestCase {
     // The span is the span of the argument (the expression has its own span)
     pub args: IndexMap<String, (Span, Expression)>,
     pub args_field_span: Span,
+    pub constraints: Vec<(Constraint, Span, Span)>,
 }
 
 #[derive(Debug, Clone)]
diff --git a/engine/baml-lib/schema-ast/src/ast/value_expression_block.rs b/engine/baml-lib/schema-ast/src/ast/value_expression_block.rs
index ccebc25fa..7619cc12f 100644
--- a/engine/baml-lib/schema-ast/src/ast/value_expression_block.rs
+++ b/engine/baml-lib/schema-ast/src/ast/value_expression_block.rs
@@ -64,7 +64,7 @@ pub struct BlockArgs {
     pub(crate) span: Span,
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq)]
 pub enum ValueExprBlockType {
     Function,
     Client,
diff --git a/engine/baml-lib/schema-ast/src/parser/datamodel.pest b/engine/baml-lib/schema-ast/src/parser/datamodel.pest
index 3373c5024..f0c3ec5c4 100644
--- a/engine/baml-lib/schema-ast/src/parser/datamodel.pest
+++ b/engine/baml-lib/schema-ast/src/parser/datamodel.pest
@@ -23,7 +23,7 @@ field_type_with_attr = { field_type ~ (NEWLINE? ~ (field_attribute | trailing_co
 value_expression_keyword  = { FUNCTION_KEYWORD | TEST_KEYWORD | CLIENT_KEYWORD | RETRY_POLICY_KEYWORD | GENERATOR_KEYWORD }
 value_expression_block    = { value_expression_keyword ~ identifier ~ named_argument_list? ~ ARROW? ~ field_type_chain? ~ SPACER_TEXT ~ BLOCK_OPEN ~ value_expression_contents ~ BLOCK_CLOSE }
 value_expression_contents = {
-    (value_expression | comment_block | empty_lines | BLOCK_LEVEL_CATCH_ALL)*
+    (value_expression | comment_block | block_attribute | empty_lines | BLOCK_LEVEL_CATCH_ALL)*
 }
 value_expression          = { identifier ~ expression? ~ (NEWLINE? ~ field_attribute)* ~ trailing_comment? }
 
diff --git a/engine/baml-lib/schema-ast/src/parser/parse_value_expression_block.rs b/engine/baml-lib/schema-ast/src/parser/parse_value_expression_block.rs
index c4544cc9e..bd1341de8 100644
--- a/engine/baml-lib/schema-ast/src/parser/parse_value_expression_block.rs
+++ b/engine/baml-lib/schema-ast/src/parser/parse_value_expression_block.rs
@@ -1,10 +1,5 @@
 use super::{
-    helpers::{parsing_catch_all, Pair},
-    parse_comments::*,
-    parse_field::parse_value_expr,
-    parse_identifier::parse_identifier,
-    parse_named_args_list::{parse_function_arg, parse_named_argument_list},
-    Rule,
+    helpers::{parsing_catch_all, Pair}, parse_attribute::parse_attribute, parse_comments::*, parse_field::parse_value_expr, parse_identifier::parse_identifier, parse_named_args_list::{parse_function_arg, parse_named_argument_list}, Rule
 };
 
 use crate::ast::*;
@@ -17,7 +12,7 @@ pub(crate) fn parse_value_expression_block(
 ) -> Result<ValueExprBlock, DatamodelError> {
     let pair_span = pair.as_span();
     let mut name: Option<Identifier> = None;
-    let attributes: Vec<Attribute> = Vec::new();
+    let mut attributes: Vec<Attribute> = Vec::new();
     let mut input = None;
     let mut output = None;
     let mut fields: Vec<Field<Expression>> = vec![];
@@ -85,6 +80,30 @@ pub(crate) fn parse_value_expression_block(
                         }
 
                         Rule::comment_block => pending_field_comment = Some(item),
+                        Rule::block_attribute => {
+                            // Only tests may have block attributes, and the only valid block
+                            // attributes are checks/asserts. Anything else is reported as a
+                            // validation error at the attribute's span and is not stored.
+                            let span = item.as_span();
+                            let attribute = parse_attribute(item, false, diagnostics);
+                            let value_is_test = sub_type == Some(ValueExprBlockType::Test);
+                            let attribute_name = attribute.name.to_string();
+                            let attribute_is_constraint =
+                                matches!(attribute_name.as_str(), "check" | "assert");
+                            if !value_is_test {
+                                diagnostics.push_error(DatamodelError::new_validation_error(
+                                    "Only Tests may contain block-level attributes",
+                                    diagnostics.span(span),
+                                ))
+                            } else if !attribute_is_constraint {
+                                diagnostics.push_error(DatamodelError::new_validation_error(
+                                    "Tests may only contain 'check' or 'assert' attributes",
+                                    diagnostics.span(span),
+                                ))
+                            } else {
+                                attributes.push(attribute);
+                            }
+                        }
                         Rule::empty_lines => {}
                         Rule::BLOCK_LEVEL_CATCH_ALL => {
                             diagnostics.push_error(DatamodelError::new_validation_error(
diff --git a/engine/baml-runtime/Cargo.toml b/engine/baml-runtime/Cargo.toml
index 680bcd932..798c8070e 100644
--- a/engine/baml-runtime/Cargo.toml
+++ b/engine/baml-runtime/Cargo.toml
@@ -43,6 +43,7 @@ baml-types = { path = "../baml-lib/baml-types" }
 internal-baml-core = { path = "../baml-lib/baml-core" }
 internal-baml-jinja = { path = "../baml-lib/jinja-runtime" }
 log.workspace = true
+minijinja.workspace = true
 pin-project-lite.workspace = true
 reqwest-eventsource = "0.6.0"
 scopeguard.workspace = true
diff --git a/engine/baml-runtime/src/constraints.rs b/engine/baml-runtime/src/constraints.rs
new file mode 100644
index 000000000..d28a86092
--- /dev/null
+++ b/engine/baml-runtime/src/constraints.rs
@@ -0,0 +1,469 @@
+use baml_types::{BamlValue, BamlValueWithMeta, Constraint, ConstraintLevel, ResponseCheck};
+use internal_baml_core::ir::jinja_helpers::{evaluate_predicate, render_expression};
+use jsonish::BamlValueWithFlags;
+
+use anyhow::Result;
+use indexmap::IndexMap;
+use minijinja;
+use std::{collections::HashMap, fmt};
+
+use crate::internal::llm_client::LLMCompleteResponse;
+
+/// Evaluate a test block's constraints against `value` (the test's output, a
+/// `BamlValueWithMeta<Vec<ResponseCheck>>`), in the order that the constraints
+/// were specified by the user.
+/// When a check in a test is evaluated, its result is added to the context
+/// so that future constraints can refer to it.
+pub fn evaluate_test_constraints(
+    args: &IndexMap<String, BamlValue>,
+    value: &BamlValueWithMeta<Vec<ResponseCheck>>,
+    response: &LLMCompleteResponse,
+    constraints: Vec<Constraint>,
+) -> TestConstraintsResult {
+    // Fold over all the constraints, updating both our success state, and
+    // our jinja context full of Check results.
+    // Finally, return the success state.
+    constraints
+        .into_iter()
+        .fold(Accumulator::new(), |acc, constraint| {
+            step_constraints(args, value, response, acc, constraint)
+        })
+        .result
+}
+
+/// The result of running a series of block-level constraints within a test.
+#[derive(Clone, Debug, PartialEq)]
+pub enum TestConstraintsResult {
+    /// Evaluation finished. `checks` holds `(name, passed)` per check, in order;
+    /// `failed_assert` is the failing assert's label ("" if unlabeled), if any.
+    Completed {
+        checks: Vec<(String, bool)>,
+        failed_assert: Option<String>,
+    },
+
+    /// There was a problem evaluating a constraint.
+    InternalError { details: String },
+}
+
+/// State update helper functions.
+impl TestConstraintsResult {
+    /// A completed run with no check results and no failed assert.
+    pub fn empty() -> Self {
+        TestConstraintsResult::Completed {
+            checks: Vec::new(),
+            failed_assert: None,
+        }
+    }
+    /// Consume the result and return its `(name, passed)` pairs. An
+    /// `InternalError` carries no checks, so it yields an empty list.
+    fn checks(self) -> Vec<(String, bool)> {
+        match self {
+            TestConstraintsResult::Completed { checks, .. } => checks,
+            _ => Vec::new(),
+        }
+    }
+    /// Append one check outcome. An already-recorded `failed_assert` is kept
+    /// rather than cleared, so a later check cannot erase a hard failure; an
+    /// `InternalError` is returned unchanged.
+    fn add_check_result(mut self, name: String, result: bool) -> Self {
+        if let TestConstraintsResult::Completed { checks, .. } = &mut self {
+            checks.push((name, result));
+        }
+        self
+    }
+    /// Record a failing assert. An unlabeled assert is stored as the empty
+    /// string so that `failed_assert` is still `Some`. An `InternalError` is
+    /// returned unchanged.
+    fn fail_assert(mut self, name: Option<String>) -> Self {
+        if let TestConstraintsResult::Completed { failed_assert, .. } = &mut self {
+            *failed_assert = Some(name.unwrap_or_default());
+        }
+        self
+    }
+}
+
+/// The state that we track as we iterate over constraints in the test block.
+struct Accumulator {
+    /// Check outcomes and assert status reported to the caller.
+    pub result: TestConstraintsResult,
+    /// Named check outcomes as jinja values, exposed as `_.checks`.
+    pub check_results: Vec<(String, minijinja::Value)>,
+}
+
+impl Accumulator {
+    /// Initial fold state, built on `TestConstraintsResult::empty()`.
+    pub fn new() -> Self {
+        Accumulator {
+            result: TestConstraintsResult::empty(),
+            check_results: Vec::new(),
+        }
+    }
+}
+
+/// The accumulator function: runs a single constraint and returns the next
+/// fold state (the success state plus the check values exposed to jinja).
+///
+/// The expression is rendered with `_` (fields `result`, `latency_ms` and
+/// `checks`), `this` (the test's value) and one variable per test argument.
+/// Rendering "true" passes, "false" or "" fails, and any other output or a
+/// render error becomes `TestConstraintsResult::InternalError`.
+fn step_constraints(
+    args: &IndexMap<String, BamlValue>,
+    value: &BamlValueWithMeta<Vec<ResponseCheck>>,
+    response: &LLMCompleteResponse,
+    acc: Accumulator,
+    constraint: Constraint,
+) -> Accumulator {
+    // Short-circuit once a hard failure (a failed assert or an internal
+    // error) has been recorded; the remaining constraints are not evaluated.
+    let already_failed = matches!(
+        acc.result,
+        TestConstraintsResult::Completed {
+            failed_assert: Some(_),
+            ..
+        } | TestConstraintsResult::InternalError { .. }
+    );
+    if already_failed {
+        return acc;
+    }
+
+    // Take the accumulator apart so that each branch below can move its
+    // pieces into the next state instead of cloning the check list.
+    let Accumulator {
+        result,
+        mut check_results,
+    } = acc;
+
+    // Snapshot of the earlier check outcomes, exposed to jinja as `_.checks`.
+    let check_results_for_jinja = check_results.iter().cloned().collect::<HashMap<_, _>>();
+    let underscore = minijinja::Value::from_serialize(
+        vec![
+            ("result", minijinja::Value::from_serialize(value)),
+            (
+                "latency_ms",
+                minijinja::Value::from_serialize(response.latency.as_millis()),
+            ),
+            (
+                "checks",
+                minijinja::Value::from_serialize(check_results_for_jinja),
+            ),
+        ]
+        .into_iter()
+        .collect::<HashMap<_, _>>(),
+    );
+
+    // Top-level jinja variables: `_`, `this`, and the test arguments by name.
+    let ctx = vec![
+        ("_".to_string(), underscore),
+        ("this".to_string(), minijinja::Value::from_serialize(value)),
+    ]
+    .into_iter()
+    .chain(
+        args.iter()
+            .map(|(name, value)| (name.to_string(), minijinja::Value::from_serialize(value))),
+    )
+    .collect();
+
+    let constraint_result_str = render_expression(&constraint.expression, &ctx);
+    let bool_result_or_internal_error: Result<bool, String> =
+        match constraint_result_str.as_ref().map(|s| s.as_str()) {
+            Ok("true") => Ok(true),
+            Ok("false") => Ok(false),
+            Ok("") => Ok(false),
+            Ok(x) => Err(format!("Expected true or false, got {x}.")),
+            Err(e) => Err(format!("Constraint error: {e:?}")),
+        };
+
+    use ConstraintLevel::*;
+
+    // The next value of the accumulator depends on several factors:
+    //  - Whether we are processing a Check or an Assert.
+    //  - Whether the constraint has a name or not.
+    //  - Whether evaluation produced a bool or an internal error.
+    // The match below dispatches on all three; arms are tried in order.
+    match (
+        constraint.level,
+        constraint.label,
+        bool_result_or_internal_error,
+    ) {
+        // A check ran to completion and succeeded or failed
+        // (i.e. returned a bool). This updates both the checks jinja context
+        // and the status.
+        (Check, Some(check_name), Ok(check_passed)) => {
+            check_results.push((check_name.clone(), check_passed.into()));
+            Accumulator {
+                result: result.add_check_result(check_name, check_passed),
+                check_results,
+            }
+        }
+
+        // Internal error always produces a hard error.
+        (_, _, Err(e)) => Accumulator {
+            result: TestConstraintsResult::InternalError { details: e },
+            check_results,
+        },
+
+        // A check without a name has no effect, and should never be observed, because
+        // the parser enforces that all checks are named.
+        (Check, None, _) => {
+            log::warn!(
+                "Encountered a check without a name: {:?}",
+                constraint.expression
+            );
+            Accumulator {
+                result,
+                check_results,
+            }
+        }
+
+        // A passing assert has no effect.
+        (Assert, _, Ok(true)) => Accumulator {
+            result,
+            check_results,
+        },
+
+        // A failing assert is a hard error.
+        (Assert, maybe_name, Ok(false)) => Accumulator {
+            result: result.fail_assert(maybe_name),
+            check_results,
+        },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::internal::llm_client::{LLMCompleteResponse, LLMCompleteResponseMetadata};
+    use baml_types::{
+        BamlValueWithMeta, Constraint, ConstraintLevel, JinjaExpression, ResponseCheck,
+    };
+    use internal_baml_jinja::RenderedPrompt;
+
+    use std::collections::HashMap;
+
+    /// Construct a value to use as a test fixture.
+    /// It aims to combine a mix of:
+    ///   - top-level vs. nested constraints
+    ///   - asserts vs. checks
+    ///   - successes vs. failures
+    ///
+    /// Roughly this schema:
+    /// {
+    ///   "name": {
+    ///      value: "Greg",
+    ///      meta: [
+    ///        (@assert(good_name, {{ this|length > 0}}), true),
+    ///        (@check(long_name, {{ this|length > 4}}), false),
+    ///      ]},
+    ///   "kids": {
+    ///     value: [
+    ///       { name: {
+    ///         value: "Tao",
+    ///         meta: (same meta as top-level name)
+    ///         },
+    ///         age: 6
+    ///       },
+    ///       { name: {
+    ///          value: "Ellie",
+    ///          meta: (same meta as top-level name, but no failing check)
+    ///          },
+    ///          age: 3
+    ///       }
+    ///     ],
+    ///     "meta": []
+    ///       (the list itself carries no checks in this fixture; the
+    ///        tests below apply `has_kids` as a test-level constraint)
+    ///   }
+    /// }
+    fn mk_value() -> BamlValueWithMeta<Vec<ResponseCheck>> {
+        fn mk_name(name: &str) -> BamlValueWithMeta<Vec<ResponseCheck>> {
+            let meta = vec![
+                ResponseCheck {
+                    name: "good_name".to_string(),
+                    expression: "this|length > 0".to_string(),
+                    status: "succeeded".to_string(),
+                },
+                ResponseCheck {
+                    name: "long_name".to_string(),
+                    expression: "this|length > 4".to_string(),
+                    status: if name.len() > 4 {
+                        "succeeded".to_string()
+                    } else {
+                        "failed".to_string()
+                    },
+                },
+            ];
+            BamlValueWithMeta::String(name.to_string(), meta)
+        }
+
+        fn mk_child(name: &str, age: i64) -> BamlValueWithMeta<Vec<ResponseCheck>> {
+            BamlValueWithMeta::Class(
+                "child".to_string(),
+                vec![
+                    ("name".to_string(), mk_name(name)),
+                    ("age".to_string(), BamlValueWithMeta::Int(age, vec![])),
+                ]
+                .into_iter()
+                .collect(),
+                vec![],
+            )
+        }
+
+        BamlValueWithMeta::Class(
+            "parent".to_string(),
+            vec![
+                ("name".to_string(), mk_name("Greg")),
+                (
+                    "kids".to_string(),
+                    BamlValueWithMeta::List(vec![mk_child("Tao", 6), mk_child("Ellie", 3)], vec![]),
+                ),
+            ]
+            .into_iter()
+            .collect(),
+            vec![],
+        )
+    }
+
+    fn mk_response() -> LLMCompleteResponse {
+        LLMCompleteResponse {
+            client: "test_client".to_string(),
+            model: "test_model".to_string(),
+            prompt: RenderedPrompt::Completion(String::new()),
+            request_options: HashMap::new(),
+            content: String::new(),
+            start_time: web_time::SystemTime::UNIX_EPOCH,
+            latency: web_time::Duration::from_millis(500),
+            metadata: LLMCompleteResponseMetadata {
+                baml_is_complete: true,
+                finish_reason: None,
+                prompt_tokens: None,
+                output_tokens: None,
+                total_tokens: None,
+            },
+        }
+    }
+
+    fn mk_check(label: &str, expr: &str) -> Constraint {
+        Constraint {
+            label: Some(label.to_string()),
+            level: ConstraintLevel::Check,
+            expression: JinjaExpression(expr.to_string()),
+        }
+    }
+
+    fn mk_assert(label: &str, expr: &str) -> Constraint {
+        Constraint {
+            label: Some(label.to_string()),
+            level: ConstraintLevel::Assert,
+            expression: JinjaExpression(expr.to_string()),
+        }
+    }
+
+    fn run_pipeline(constraints: &[Constraint]) -> TestConstraintsResult {
+        let args = IndexMap::new();
+        let value = mk_value();
+        let constraints = constraints.into();
+        let response = mk_response();
+        evaluate_test_constraints(&args, &value, &response, constraints)
+    }
+
+    #[test]
+    fn basic_test_constraints() {
+        let res = run_pipeline(&[mk_assert("has_kids", "_.result.kids|length > 0")]);
+        assert_eq!(
+            res,
+            TestConstraintsResult::Completed {
+                checks: vec![],
+                failed_assert: None,
+            }
+        );
+    }
+
+    #[test]
+    fn test_dependencies() {
+        let res = run_pipeline(&[
+            mk_check("has_kids", "_.result.kids|length > 0"),
+            mk_check("not_too_many", "this.kids.length < 100"),
+            mk_assert("both_pass", "_.checks.has_kids and _.checks.not_too_many"),
+        ]);
+        assert_eq!(
+            res,
+            TestConstraintsResult::Completed {
+                checks: vec![
+                    ("has_kids".to_string(), true),
+                    ("not_too_many".to_string(), true),
+                ],
+                failed_assert: None
+            }
+        );
+    }
+
+    #[test]
+    fn test_dependencies_non_check() {
+        let res = run_pipeline(&[
+            mk_assert("has_kids", "_.result.kids|length > 0"),
+            mk_check("not_too_many", "this.kids.length < 100"),
+            mk_assert("both_pass", "_.checks.has_kids and _.checks.not_too_many"),
+        ]);
+        // This constraint set should fail because `has_kids` is an assert, not
+        // a check, therefore it doesn't get a field in `checks`.
+        assert_eq!(
+            res,
+            TestConstraintsResult::Completed {
+                checks: vec![("not_too_many".to_string(), true),],
+                failed_assert: Some("both_pass".to_string())
+            }
+        );
+    }
+
+    #[test]
+    fn test_fast_is_sufficient() {
+        let res = run_pipeline(&[
+            mk_check("has_kids", "_.result.kids|length > 0"),
+            mk_check("not_too_many", "this.kids.length < 100"),
+            mk_check("both_pass", "_.checks.has_kids and _.checks.not_too_many"),
+            mk_assert("either_or", "_.checks.both_pass or _.latency_ms < 1000"),
+        ]);
+        assert_eq!(
+            res,
+            TestConstraintsResult::Completed {
+                checks: vec![
+                    ("has_kids".to_string(), true),
+                    ("not_too_many".to_string(), true),
+                    ("both_pass".to_string(), true),
+                ],
+                failed_assert: None
+            }
+        );
+    }
+
+    #[test]
+    fn test_failing_checks() {
+        let res = run_pipeline(&[
+            mk_check("has_kids", "_.result.kids|length > 0"),
+            mk_check("not_too_many", "this.kids.length < 100"),
+            mk_assert("both_pass", "_.checks.has_kids and _.checks.not_too_many"),
+            mk_check("no_kids", "this.kids|length == 0"),
+            mk_check("way_too_many", "this.kids|length > 1000"),
+        ]);
+        assert_eq!(
+            res,
+            TestConstraintsResult::Completed {
+                checks: vec![
+                    ("has_kids".to_string(), true),
+                    ("not_too_many".to_string(), true),
+                    ("no_kids".to_string(), false),
+                    ("way_too_many".to_string(), false)
+                ],
+                failed_assert: None
+            }
+        );
+    }
+
+    #[test]
+    fn test_internal_error() {
+        let res = run_pipeline(&[mk_check("faulty", "__.result.kids|length > 0")]);
+        // Expected to yield an internal error because of the typo: `__` (double underscore).
+        assert!(matches!(res, TestConstraintsResult::InternalError { .. }));
+    }
+}
diff --git a/engine/baml-runtime/src/internal/llm_client/mod.rs b/engine/baml-runtime/src/internal/llm_client/mod.rs
index d61ab0caf..352d0dbee 100644
--- a/engine/baml-runtime/src/internal/llm_client/mod.rs
+++ b/engine/baml-runtime/src/internal/llm_client/mod.rs
@@ -27,10 +27,10 @@ use wasm_bindgen::JsValue;
 pub type ResponseBamlValue = BamlValueWithMeta<Vec<ResponseCheck>>;
 
 /// Validate a parsed value, checking asserts and checks.
-pub fn parsed_value_to_response(baml_value: &BamlValueWithFlags) -> Result<ResponseBamlValue> {
+pub fn parsed_value_to_response(baml_value: &BamlValueWithFlags) -> ResponseBamlValue {
     let baml_value_with_meta: BamlValueWithMeta<Vec<(String, JinjaExpression, bool)>> =
         baml_value.clone().into();
-    Ok(baml_value_with_meta.map_meta(|cs| {
+    baml_value_with_meta.map_meta(|cs| {
         cs.iter()
             .map(|(label, expr, result)| {
                 let status = (if *result { "succeeded" } else { "failed" }).to_string();
@@ -41,7 +41,7 @@ pub fn parsed_value_to_response(baml_value: &BamlValueWithFlags) -> Result<Respo
                 }
             })
             .collect()
-    }))
+    })
 }
 
 #[derive(Clone, Copy, PartialEq)]
diff --git a/engine/baml-runtime/src/internal/llm_client/orchestrator/call.rs b/engine/baml-runtime/src/internal/llm_client/orchestrator/call.rs
index 735d5f33c..c8c7dd74c 100644
--- a/engine/baml-runtime/src/internal/llm_client/orchestrator/call.rs
+++ b/engine/baml-runtime/src/internal/llm_client/orchestrator/call.rs
@@ -56,7 +56,7 @@ pub async fn orchestrate(
 
         let sleep_duration = node.error_sleep_duration().cloned();
         let (parsed_response, response_with_constraints) = match parsed_response {
-                Some(Ok(v)) => (Some(Ok(v.clone())), Some(parsed_value_to_response(&v))),
+                Some(Ok(v)) => (Some(Ok(v.clone())), Some(Ok(parsed_value_to_response(&v)))),
                 Some(Err(e)) => (None, Some(Err(e))),
                 None => (None, None),
             };
diff --git a/engine/baml-runtime/src/internal/llm_client/orchestrator/stream.rs b/engine/baml-runtime/src/internal/llm_client/orchestrator/stream.rs
index 680733c24..74750a8bd 100644
--- a/engine/baml-runtime/src/internal/llm_client/orchestrator/stream.rs
+++ b/engine/baml-runtime/src/internal/llm_client/orchestrator/stream.rs
@@ -66,7 +66,7 @@ where
                             LLMResponse::Success(s) => {
                                 let parsed = partial_parse_fn(&s.content);
                                 let (parsed, response_value) = match parsed {
-                                    Ok(v) => (Some(Ok(v.clone())), Some(parsed_value_to_response(&v))),
+                                    Ok(v) => (Some(Ok(v.clone())), Some(Ok(parsed_value_to_response(&v)))),
                                     Err(e) => (None, Some(Err(e))),
                                 };
                                 on_event(FunctionResult::new(
@@ -103,7 +103,7 @@ where
             _ => None,
         };
         let (parsed_response, response_value) = match parsed_response {
-            Some(Ok(v)) => (Some(Ok(v.clone())), Some(parsed_value_to_response(&v))),
+            Some(Ok(v)) => (Some(Ok(v.clone())), Some(Ok(parsed_value_to_response(&v)))),
             Some(Err(e)) => (None, Some(Err(e))),
             None => (None, None),
         };
diff --git a/engine/baml-runtime/src/lib.rs b/engine/baml-runtime/src/lib.rs
index d4659ead4..5308ed79c 100644
--- a/engine/baml-runtime/src/lib.rs
+++ b/engine/baml-runtime/src/lib.rs
@@ -8,6 +8,7 @@ pub(crate) mod internal;
 #[cfg(not(target_arch = "wasm32"))]
 pub mod cli;
 pub mod client_registry;
+pub mod constraints;
 pub mod errors;
 pub mod request;
 mod runtime;
@@ -25,6 +26,7 @@ use anyhow::Result;
 
 use baml_types::BamlMap;
 use baml_types::BamlValue;
+use baml_types::Constraint;
 use cfg_if::cfg_if;
 use client_registry::ClientRegistry;
 use indexmap::IndexMap;
@@ -62,6 +64,9 @@ pub use internal_baml_core::internal_baml_diagnostics;
 pub use internal_baml_core::internal_baml_diagnostics::Diagnostics as DiagnosticsError;
 pub use internal_baml_core::ir::{scope_diagnostics, FieldType, IRHelper, TypeValue};
 
+use crate::constraints::{evaluate_test_constraints, TestConstraintsResult};
+use crate::internal::llm_client::LLMResponse;
+
 #[cfg(not(target_arch = "wasm32"))]
 static TOKIO_SINGLETON: OnceLock<std::io::Result<Arc<tokio::runtime::Runtime>>> = OnceLock::new();
 
@@ -179,13 +184,27 @@ impl BamlRuntime {
 }
 
 impl BamlRuntime {
+    /// Resolve a test's arguments together with its block-level constraints.
+    /// An error from either lookup is returned to the caller.
+    pub fn get_test_params_and_constraints(
+        &self,
+        function_name: &str,
+        test_name: &str,
+        ctx: &RuntimeContext,
+    ) -> Result<(BamlMap<String, BamlValue>, Vec<Constraint>)> {
+        let params = self.inner.get_test_params(function_name, test_name, ctx)?;
+        let constraints = self.inner.get_test_constraints(function_name, test_name, ctx)?;
+        Ok((params, constraints))
+    }
+
     pub fn get_test_params(
         &self,
         function_name: &str,
         test_name: &str,
         ctx: &RuntimeContext,
     ) -> Result<BamlMap<String, BamlValue>> {
-        self.inner.get_test_params(function_name, test_name, ctx)
+        let (params, _) = self.get_test_params_and_constraints(function_name, test_name, ctx)?;
+        Ok(params)
     }
 
     pub async fn run_test<F>(
@@ -200,40 +219,50 @@ impl BamlRuntime {
     {
         let span = self.tracer.start_span(test_name, ctx, &Default::default());
 
-        let response = match ctx.create_ctx(None, None) {
-            Ok(rctx) => {
-                let params = self.get_test_params(function_name, test_name, &rctx);
-                match params {
-                    Ok(params) => match ctx.create_ctx(None, None) {
-                        Ok(rctx_stream) => {
-                            let stream = self.inner.stream_function_impl(
-                                function_name.into(),
-                                &params,
-                                self.tracer.clone(),
-                                rctx_stream,
-                                #[cfg(not(target_arch = "wasm32"))]
-                                self.async_runtime.clone(),
-                            );
-                            match stream {
-                                Ok(mut stream) => {
-                                    let (response, span) =
-                                        stream.run(on_event, ctx, None, None).await;
-                                    response.map(|res| TestResponse {
-                                        function_response: res,
-                                        function_span: span,
-                                    })
-                                }
-                                Err(e) => Err(e),
-                            }
-                        }
-                        Err(e) => Err(e),
-                    },
-                    Err(e) => Err(e),
+        let run_to_response = || async {
+            let rctx = ctx.create_ctx(None, None)?;
+            let (params, constraints) =
+                self.get_test_params_and_constraints(function_name, test_name, &rctx)?;
+            let rctx_stream = ctx.create_ctx(None, None)?;
+            let mut stream = self.inner.stream_function_impl(
+                function_name.into(),
+                &params,
+                self.tracer.clone(),
+                rctx_stream,
+                #[cfg(not(target_arch = "wasm32"))]
+                self.async_runtime.clone(),
+            )?;
+            let (response_res, span_uuid) = stream.run(on_event, ctx, None, None).await;
+            let res = response_res?;
+            let (_, llm_resp, _, val) = res
+                .event_chain()
+                .iter()
+                .last()
+                .context("Expected non-empty event chain")?;
+            let complete_resp = match llm_resp {
+                LLMResponse::Success(complete_llm_response) => Ok(complete_llm_response),
+                _ => Err(anyhow::anyhow!("LLM Response was not successful")),
+            }?;
+            let test_constraints_result = if constraints.is_empty() {
+                TestConstraintsResult::empty()
+            } else {
+                match val {
+                    Some(Ok(value)) => {
+                        evaluate_test_constraints(&params, &value, &complete_resp, constraints)
+                    }
+                    _ => TestConstraintsResult::empty(),
                 }
-            }
-            Err(e) => Err(e),
+            };
+            let test_response = Ok(TestResponse {
+                function_response: res,
+                function_span: span_uuid,
+                constraints_result: test_constraints_result,
+            });
+            test_response
         };
 
+        let response = run_to_response().await;
+
         let mut target_id = None;
         if let Some(span) = span {
             #[cfg(not(target_arch = "wasm32"))]
diff --git a/engine/baml-runtime/src/runtime/runtime_interface.rs b/engine/baml-runtime/src/runtime/runtime_interface.rs
index 62771412c..db850ad73 100644
--- a/engine/baml-runtime/src/runtime/runtime_interface.rs
+++ b/engine/baml-runtime/src/runtime/runtime_interface.rs
@@ -24,7 +24,7 @@ use crate::{
     RuntimeContext, RuntimeInterface,
 };
 use anyhow::{Context, Result};
-use baml_types::{BamlMap, BamlValue};
+use baml_types::{BamlMap, BamlValue, Constraint};
 use internal_baml_core::{
     internal_baml_diagnostics::SourceFile,
     ir::{
@@ -281,6 +281,14 @@ impl InternalRuntimeInterface for InternalBamlRuntime {
             Err(e) => return Err(anyhow::anyhow!("Unable to resolve test params: {:?}", e)),
         }
     }
+
+    fn get_test_constraints(
+        &self, function_name: &str, test_name: &str, ctx: &RuntimeContext
+    ) -> Result<Vec<Constraint>> {
+        let func = self.get_function(function_name, ctx)?; // `?` propagates a failed function lookup
+        let walker = self.ir().find_test(&func, test_name)?; // `?` propagates a failed test lookup
+        Ok(walker.item.1.elem.constraints.clone()) // owned copy of the test's constraints
+    }
 }
 
 impl RuntimeConstructor for InternalBamlRuntime {
diff --git a/engine/baml-runtime/src/runtime_interface.rs b/engine/baml-runtime/src/runtime_interface.rs
index 396e646f9..fd1149007 100644
--- a/engine/baml-runtime/src/runtime_interface.rs
+++ b/engine/baml-runtime/src/runtime_interface.rs
@@ -1,5 +1,5 @@
 use anyhow::Result;
-use baml_types::{BamlMap, BamlValue};
+use baml_types::{BamlMap, BamlValue, Constraint};
 use internal_baml_core::internal_baml_diagnostics::Diagnostics;
 use internal_baml_core::ir::repr::ClientSpec;
 use internal_baml_core::ir::{repr::IntermediateRepr, FunctionWalker};
@@ -159,4 +159,11 @@ pub trait InternalRuntimeInterface {
         test_name: &str,
         ctx: &RuntimeContext,
     ) -> Result<BamlMap<String, BamlValue>>;
+
+    fn get_test_constraints(
+        &self,
+        function_name: &str,
+        test_name: &str,
+        ctx: &RuntimeContext
+    ) -> Result<Vec<Constraint>>;
 }
diff --git a/engine/baml-runtime/src/types/response.rs b/engine/baml-runtime/src/types/response.rs
index 02ef5d4cd..f0581091d 100644
--- a/engine/baml-runtime/src/types/response.rs
+++ b/engine/baml-runtime/src/types/response.rs
@@ -1,5 +1,6 @@
 pub use crate::internal::llm_client::LLMResponse;
 use crate::{
+    constraints::TestConstraintsResult,
     errors::ExposedError,
     internal::llm_client::{orchestrator::OrchestrationScope, ResponseBamlValue},
 };
@@ -182,9 +183,11 @@ impl FunctionResult {
     }
 }
 
+#[derive(Debug)]
 pub struct TestResponse {
     pub function_response: FunctionResult,
     pub function_span: Option<uuid::Uuid>,
+    pub constraints_result: TestConstraintsResult,
 }
 
 impl std::fmt::Display for TestResponse {
@@ -196,6 +199,7 @@ impl std::fmt::Display for TestResponse {
 #[derive(Debug, PartialEq, Eq)]
 pub enum TestStatus<'a> {
     Pass,
+    NeedsHumanEval(Vec<String>),
     Fail(TestFailReason<'a>),
 }
 
@@ -203,6 +207,9 @@ impl From<TestStatus<'_>> for BamlValue {
     fn from(status: TestStatus) -> Self {
         match status {
             TestStatus::Pass => BamlValue::String("pass".to_string()),
+            TestStatus::NeedsHumanEval(checks) => {
+                BamlValue::String(format!("checks need human evaluation: {:?}", checks))
+            }
             TestStatus::Fail(r) => BamlValue::String(format!("failed! {:?}", r)),
         }
     }
@@ -210,9 +217,13 @@ impl From<TestStatus<'_>> for BamlValue {
 
 #[derive(Debug)]
 pub enum TestFailReason<'a> {
-    TestUnspecified(&'a anyhow::Error),
+    TestUnspecified(anyhow::Error), // owned: `status()` builds this error on the spot with `anyhow::anyhow!`
     TestLLMFailure(&'a LLMResponse),
     TestParseFailure(&'a anyhow::Error),
+    TestConstraintsFailure {
+        checks: Vec<(String, bool)>, // (check name, passed?) pairs
+        failed_assert: Option<String>, // name of the assert that failed, if any
+    },
 }
 
 impl PartialEq for TestFailReason<'_> {
@@ -235,7 +246,26 @@ impl TestResponse {
         let func_res = &self.function_response;
         if let Some(parsed) = func_res.result_with_constraints() {
             if parsed.is_ok() {
-                TestStatus::Pass
+                match self.constraints_result.clone() {
+                    TestConstraintsResult::InternalError { details } => {
+                        TestStatus::Fail(TestFailReason::TestUnspecified(anyhow::anyhow!(details)))
+                    }
+                    TestConstraintsResult::Completed {
+                        checks,
+                        failed_assert,
+                    } => {
+                        let n_failed_checks: usize =
+                            checks.iter().filter(|(_, pass)| !pass).count();
+                        if failed_assert.is_some() || n_failed_checks > 0 {
+                            TestStatus::Fail(TestFailReason::TestConstraintsFailure {
+                                checks,
+                                failed_assert,
+                            })
+                        } else {
+                            TestStatus::Pass
+                        }
+                    }
+                }
             } else {
                 TestStatus::Fail(TestFailReason::TestParseFailure(
                     parsed.as_ref().unwrap_err(),
diff --git a/engine/baml-schema-wasm/Cargo.toml b/engine/baml-schema-wasm/Cargo.toml
index 4ca8c00a3..d83bcede2 100644
--- a/engine/baml-schema-wasm/Cargo.toml
+++ b/engine/baml-schema-wasm/Cargo.toml
@@ -36,6 +36,7 @@ wasm-bindgen-futures = "0.4.42"
 wasm-logger = { version = "0.2.0" }
 web-time.workspace = true
 either = "1.8.1"
+itertools = "0.13.0"
 
 [dependencies.web-sys]
 version = "0.3.69"
diff --git a/engine/baml-schema-wasm/src/runtime_wasm/mod.rs b/engine/baml-schema-wasm/src/runtime_wasm/mod.rs
index 1e72eb67b..62557b82e 100644
--- a/engine/baml-schema-wasm/src/runtime_wasm/mod.rs
+++ b/engine/baml-schema-wasm/src/runtime_wasm/mod.rs
@@ -20,6 +20,7 @@ use jsonish::deserializer::deserialize_flags::Flag;
 use jsonish::BamlValueWithFlags;
 
 use baml_runtime::internal::llm_client::orchestrator::ExecutionScope;
+use itertools::join;
 use js_sys::Promise;
 use js_sys::Uint8Array;
 use serde::{Deserialize, Serialize};
@@ -397,6 +398,7 @@ pub struct WasmFunctionResponse {
 }
 
 #[wasm_bindgen]
+#[derive(Debug)]
 pub struct WasmTestResponse {
     test_response: anyhow::Result<baml_runtime::TestResponse>,
     span: Option<uuid::Uuid>,
@@ -415,10 +417,13 @@ pub struct WasmParsedTestResponse {
 }
 
 #[wasm_bindgen]
+#[derive(Clone, Debug)]
 pub enum TestStatus {
     Passed,
     LLMFailure,
     ParseFailure,
+    ConstraintsFailed, // a check failed or needs human evaluation, and no assert failed
+    AssertFailed, // a test-level assert failed
     UnableToRun,
 }
 
@@ -501,7 +506,19 @@ impl WasmFunctionResponse {
     }
 }
 
+// TODO: What is supposed to happen with the serialized baml_value?
+// That value has checks nested inside. Are they meant to be removed
+// during flattening? Or duplicated into the top-level list of checks?
 fn flatten_checks(value: &BamlValueWithFlags) -> (serde_json::Value, usize) {
+    // // Note: (Greg) depending on the goal of this function, we may be able
+    // // to replace most of it like this:
+    // let value_with_meta: BamlValueWithMeta<Vec<ResponseCheck>> = parsed_value_to_response(value);
+    // let n_checks: usize = value_with_meta.iter().map(|node| node.meta().len()).sum();
+    // let bare_baml_value: BamlValue = value_with_meta.into();
+    // let json_value: serde_json::Value = serde_json::to_value(bare_baml_value).unwrap_or(
+    //     "Error converting value to JSON".into()
+    // );
+
     type J = serde_json::Value;
 
     let checks = value
@@ -511,12 +528,7 @@ fn flatten_checks(value: &BamlValueWithFlags) -> (serde_json::Value, usize) {
         .flat_map(|f| match f {
             Flag::ConstraintResults(c) => c
                 .iter()
-                .map(|(label, _expr, b)| {
-                    (
-                        label.clone(),
-                        *b,
-                    )
-                })
+                .map(|(label, _expr, b)| (label.clone(), *b))
                 .collect::<Vec<_>>(),
             _ => vec![],
         })
@@ -580,10 +592,20 @@ impl WasmTestResponse {
         match &self.test_response {
             Ok(t) => match t.status() {
                 baml_runtime::TestStatus::Pass => TestStatus::Passed,
+                baml_runtime::TestStatus::NeedsHumanEval(_) => TestStatus::ConstraintsFailed,
                 baml_runtime::TestStatus::Fail(r) => match r {
                     baml_runtime::TestFailReason::TestUnspecified(_) => TestStatus::UnableToRun,
                     baml_runtime::TestFailReason::TestLLMFailure(_) => TestStatus::LLMFailure,
                     baml_runtime::TestFailReason::TestParseFailure(_) => TestStatus::ParseFailure,
+                    baml_runtime::TestFailReason::TestConstraintsFailure {
+                        failed_assert, ..
+                    } => {
+                        if failed_assert.is_some() {
+                            TestStatus::AssertFailed
+                        } else {
+                            TestStatus::ConstraintsFailed
+                        }
+                    }
                 },
             },
             Err(_) => TestStatus::UnableToRun,
@@ -645,6 +667,10 @@ impl WasmTestResponse {
             Ok(r) => match r.status() {
                 baml_runtime::TestStatus::Pass => None,
                 baml_runtime::TestStatus::Fail(r) => r.render_error(),
+                baml_runtime::TestStatus::NeedsHumanEval(checks) => Some(format!(
+                    "Checks require human validation: {}",
+                    join(checks, ", ")
+                )),
             },
             Err(e) => Some(format!("{e:#}")),
         }
@@ -759,6 +785,23 @@ impl WithRenderError for baml_runtime::TestFailReason<'_> {
             baml_runtime::TestFailReason::TestUnspecified(e) => Some(format!("{e:#}")),
             baml_runtime::TestFailReason::TestLLMFailure(f) => f.render_error(),
             baml_runtime::TestFailReason::TestParseFailure(e) => Some(format!("{e:#}")),
+            baml_runtime::TestFailReason::TestConstraintsFailure {
+                checks,
+                failed_assert,
+            } => {
+                let checks_msg = if checks.len() > 0 {
+                    let check_msgs = checks.into_iter().map(|(name, pass)| {
+                        format!("{name}: {}", if *pass { "Passed" } else { "Failed" })
+                    });
+                    format!("Check results:\n{}", join(check_msgs, "\n"))
+                } else {
+                    String::new()
+                };
+                let assert_msg = failed_assert
+                    .as_ref()
+                    .map_or("".to_string(), |name| format!("\nFailed assert: {name}"));
+                Some(format!("{checks_msg}{assert_msg}"))
+            }
         }
     }
 }
diff --git a/typescript/playground-common/src/baml_wasm_web/test_uis/testHooks.ts b/typescript/playground-common/src/baml_wasm_web/test_uis/testHooks.ts
index 4c7647808..a294054e6 100644
--- a/typescript/playground-common/src/baml_wasm_web/test_uis/testHooks.ts
+++ b/typescript/playground-common/src/baml_wasm_web/test_uis/testHooks.ts
@@ -10,7 +10,7 @@ export const showTestsAtom = atom(false)
 export const showClientGraphAtom = atom(false)
 
 export type TestStatusType = 'queued' | 'running' | 'done' | 'error'
-export type DoneTestStatusType = 'passed' | 'llm_failed' | 'parse_failed' | 'error'
+export type DoneTestStatusType = 'passed' | 'llm_failed' | 'parse_failed' | 'constraints_failed' | 'error'
 export type TestState =
   | {
       status: 'queued'
@@ -44,11 +44,45 @@ export const statusCountAtom = atom({
     passed: 0,
     llm_failed: 0,
     parse_failed: 0,
+    constraints_failed: 0,
     error: 0,
   },
   error: 0,
 })
 
+/// This atom tracks the state of the full test suite.
+/// `unknown` means tests haven't been run yet. `pass` means at least
+/// one test passed and nothing worse has been recorded.
+/// `warn` means at least one check has failed, and `fail` means at
+/// least one test failed outright (LLM failure, parse failure, or a
+/// failed assert), or an internal error occurred.
+export type TestSuiteSummary = 'pass' | 'warn' | 'fail' | 'unknown'
+export const testSuiteSummaryAtom = atom<TestSuiteSummary>('unknown')
+
+/// Fold a new per-test result into the running test-suite summary.
+/// Summaries rank `unknown` < `pass` < `warn` < `fail`; the higher-ranked
+/// of the two arguments is returned, and a tie keeps the old summary.
+function updateTestSuiteState(old_result: TestSuiteSummary, new_result: TestSuiteSummary): TestSuiteSummary {
+  // Rank of each summary state; a larger number takes precedence.
+  const rank: Record<TestSuiteSummary, number> = {
+    unknown: 0,
+    pass: 1,
+    warn: 2,
+    fail: 3,
+  }
+
+  const oldRank = rank[old_result]
+  const newRank = rank[new_result]
+
+  // Only a strictly higher-ranked result replaces the current summary,
+  // so the merged summary is never lower than the old one.
+  if (newRank > oldRank) {
+    return new_result
+  }
+
+  return old_result
+}
+
 export const useRunHooks = () => {
   const isRunning = useAtomValue(isRunningAtom)
 
@@ -68,6 +102,7 @@ export const useRunHooks = () => {
         }
         set(isRunningAtom, true)
         set(showTestsAtom, true)
+        set(testSuiteSummaryAtom, 'unknown')
 
         vscode.postMessage({
           command: 'telemetry',
@@ -92,6 +127,7 @@ export const useRunHooks = () => {
             passed: 0,
             llm_failed: 0,
             parse_failed: 0,
+            constraints_failed: 0,
             error: 0,
           },
           error: 0,
@@ -144,7 +180,7 @@ export const useRunHooks = () => {
               const { res, elapsed } = result.value
               // console.log('result', i, result.value.res.llm_response(), 'batch[i]', batch[i])
 
-              let status = res.status()
+              let status: number = res.status() // raw numeric status code reported by the WASM test response
               let response_status: DoneTestStatusType = 'error'
               if (status === 0) {
                 response_status = 'passed'
@@ -152,6 +188,8 @@ export const useRunHooks = () => {
                 response_status = 'llm_failed'
               } else if (status === 2) {
                 response_status = 'parse_failed'
+              } else if (status === 3 || status === 4) {
+                response_status = 'constraints_failed'
               } else {
                 response_status = 'error'
               }
@@ -171,6 +209,23 @@ export const useRunHooks = () => {
                   running: prev.running - 1,
                 }
               })
+
+              // Fold this result into the suite summary: 0 = passed, 3 = constraints
+              // failed without a failed assert. Everything else (LLM/parse failure,
+              // failed assert, unable to run, or an unrecognized code) is a failure,
+              // so a test that errored can never leave the summary looking like a pass.
+              let newTestSuiteStatus: TestSuiteSummary
+              if (status === 0) {
+                newTestSuiteStatus = 'pass'
+              } else if (status === 3) {
+                newTestSuiteStatus = 'warn'
+              } else {
+                newTestSuiteStatus = 'fail'
+              }
+
+              let currentSummary = get(testSuiteSummaryAtom)
+              let updatedSummary = updateTestSuiteState(currentSummary, newTestSuiteStatus)
+              set(testSuiteSummaryAtom, updatedSummary)
             } else {
               set(testStatusAtom(batch[i]), { status: 'error', message: `${result.reason}` })
               set(statusCountAtom, (prev) => {
diff --git a/typescript/playground-common/src/baml_wasm_web/test_uis/test_result.tsx b/typescript/playground-common/src/baml_wasm_web/test_uis/test_result.tsx
index b73114f8e..abd80b5cb 100644
--- a/typescript/playground-common/src/baml_wasm_web/test_uis/test_result.tsx
+++ b/typescript/playground-common/src/baml_wasm_web/test_uis/test_result.tsx
@@ -9,6 +9,7 @@ import {
   runningTestsAtom,
   statusCountAtom,
   testStatusAtom,
+  testSuiteSummaryAtom,
   DoneTestStatusType,
   useRunHooks,
   showTestsAtom,
@@ -57,6 +58,8 @@ const TestStatusMessage: React.FC<{ testStatus: DoneTestStatusType }> = ({ testS
       return <div className='text-vscode-testing-iconFailed'>LLM Failed</div>
     case 'parse_failed':
       return <div className='text-vscode-testing-iconFailed'>Parse Failed</div>
+    case 'constraints_failed':
+      return <div className='text-vscode-testing-iconFailed'>Constraints Failed</div>
     case 'error':
       return <div className='text-vscode-testing-iconFailed'>Unable to run</div>
   }
@@ -98,8 +101,10 @@ const TestStatusIcon: React.FC<{
   )
 }
 
-type FilterValues = 'queued' | 'running' | 'error' | 'llm_failed' | 'parse_failed' | 'passed'
-const filterAtom = atom(new Set<FilterValues>(['running', 'error', 'llm_failed', 'parse_failed', 'passed']))
+type FilterValues = 'queued' | 'running' | 'error' | 'llm_failed' | 'parse_failed' | 'constraints_failed' | 'passed'
+const filterAtom = atom(
+  new Set<FilterValues>(['running', 'error', 'llm_failed', 'parse_failed', 'constraints_failed', 'passed']),
+)
 
 const checkFilter = (filter: Set<FilterValues>, status: TestStatusType, test_status?: DoneTestStatusType) => {
   if (filter.size === 0) {
@@ -218,7 +223,9 @@ const ParsedTestResult: React.FC<{ doneStatus: string; parsed?: WasmParsedTestRe
             </>
           ) : (
             <>
-              {failure && <pre className='text-xs whitespace-pre-wrap text-vscode-errorForeground'>{failure}</pre>}
+              {failure && doneStatus === 'parse_failed' && (
+                <pre className='text-xs whitespace-pre-wrap text-vscode-errorForeground'>{failure}</pre>
+              )}
               {parsed !== undefined && (
                 <>
                   <JsonView
@@ -537,6 +544,12 @@ const TestStatusBanner: React.FC = () => {
         count={statusCounts.done.parse_failed}
         onClick={() => toggleFilter('parse_failed')}
       />
+      <FilterButton
+        selected={filter.has('constraints_failed')}
+        name='Constraints Failed'
+        count={statusCounts.done.constraints_failed}
+        onClick={() => toggleFilter('constraints_failed')}
+      />
       <FilterButton
         selected={filter.has('passed')}
         name='Passed'
@@ -653,6 +666,7 @@ const TestResults: React.FC = () => {
   const selectedFunction = useAtomValue(selectedFunctionAtom)
   const [showTests, setShowTests] = useAtom(showTestsAtom)
   const [showClientGraph, setClientGraph] = useAtom(showClientGraphAtom)
+  const [testSuiteSummary] = useAtom(testSuiteSummaryAtom)
 
   // reset the tab when switching funcs
   useEffect(() => {
@@ -660,6 +674,22 @@ const TestResults: React.FC = () => {
   }, [selectedFunction?.name])
   const isNextJs = (window as any).next?.version
 
+  let testSuiteIcon = <span> ❌</span>
+  switch (testSuiteSummary) {
+    case 'fail':
+      testSuiteIcon = <span> ❌</span>
+      break
+    case 'pass':
+      testSuiteIcon = <span> ✅</span>
+      break
+    case 'warn':
+      testSuiteIcon = <span className='text-yellow-500'>⚠️</span>
+      break
+    case 'unknown':
+      testSuiteIcon = <span> </span>
+      break
+  }
+
   return (
     <div className='flex flex-col gap-2 px-1 w-full'>
       <div className='flex flex-row gap-2 items-center'>
@@ -689,7 +719,9 @@ const TestResults: React.FC = () => {
             setClientGraph(false)
           }}
         >
-          Test Results
+          <div className='flex gap-1'>
+            <span>Test Results</span> {testSuiteIcon}
+          </div>
         </Badge>
         <Badge
           className={clsx(