---
title: "Launch Week #1, Day 2: Evaluations"
date: "2024-12-03"
description: "Evaluations on Laminar"
author:
  name: Dinmukhamed Mailibay
  url: https://x.com/din_mlb
image: /blog/2024-12-03-evals.png
tags: ["launch week 1", "evaluations"]
---

It is no secret that evaluations are a critical part of any system with any
amount of non-deterministic behavior, and LLMs are no exception.

Almost every LLM or AI dev tool has evaluations today, and many people even
consider evaluations a core part of LLM observability. Many platforms started with observability
and added evals later; some did the opposite. Evaluations can take many forms, from simple
deterministic checks to LLM-as-a-judge.

In this blog post, we briefly talk about evaluations in general and what we believe are
the best practices, and then we dive into how to use evaluations on Laminar.

## What is an evaluation?

From the meaning of the word itself, an evaluation is the process of assessing the value or quality of something.
In the context of LLMs, it is the process of assessing the output of a model given a prompt and an input.

We like to think of evaluations as unit tests for LLMs. The main difference is that
the tests are not deterministic, and the same input can produce different outputs on different runs.

Thus, very simple assertions like `output == expected_output` are not enough, and so the results of
each evaluation can be broader than just pass or fail.

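For example, an evaluator can return a numeric score instead of a boolean. Here is a minimal
sketch of such a scorer; the `jaccardSimilarity` helper and the sample strings are ours, purely for illustration:

```typescript
// A hypothetical evaluator that returns a score in [0, 1] instead of pass/fail:
// token-level Jaccard similarity between the model output and the expected output.
function jaccardSimilarity(output: string, expected: string): number {
  const a = new Set(output.toLowerCase().split(/\s+/));
  const b = new Set(expected.toLowerCase().split(/\s+/));
  const intersection = [...a].filter((token) => b.has(token)).length;
  const union = new Set([...a, ...b]).size;
  return union === 0 ? 0 : intersection / union;
}

// 1.0 means identical token sets; values in between are partial credit.
console.log(jaccardSimilarity("the cat sat", "the cat sat on the mat"));
```
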
## Best practices

Here are some of the things we believe work best for evaluations.

### Define evaluations in code

LLM-as-a-judge is powerful, but code gives you much more flexibility. And guess what? You can call
LLM-as-a-judge from code, so code evaluations are, in a sense, a superset of LLM-as-a-judge.

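As a rough sketch of what that looks like (the judging prompt, the 0-5 scale, and the `llmJudge`
name are our own choices for this example, not a Laminar API):

```typescript
import OpenAI from "openai";

const openai = new OpenAI();

// A hypothetical code evaluator that delegates the judgment to an LLM
// and returns a numeric score that the rest of your code can post-process.
async function llmJudge(output: string, target: string): Promise<number> {
  const response = await openai.chat.completions.create({
    model: "gpt-4o",
    messages: [
      {
        role: "user",
        content:
          `Rate from 0 to 5 how well the answer matches the reference.\n` +
          `Answer: ${output}\nReference: ${target}\n` +
          `Reply with a single number.`,
      },
    ],
  });
  return Number(response.choices[0].message.content ?? "0");
}
```
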
### Evaluate each step separately

It is very tempting to evaluate the entire execution of a complex agent, especially if you want
to make sure the agent does not go off-track. While we think there is value in doing so, we
think that evaluating each step separately is even more important.

The main problem with evaluating the entire execution is that it is hard to tell which step
caused a failure and at what point it happened. There are just too many moving parts.

On the other hand, if we evaluate each step separately, we can pinpoint exactly where the failure is coming from.

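For illustration, here is a minimal sketch of the difference; the `StepTrace` shape and the two
steps are hypothetical, not something Laminar prescribes:

```typescript
// Hypothetical trace of a two-step agent: it first picks a tool, then drafts a reply.
type StepTrace = {
  toolCall: string;    // which tool the agent decided to call
  finalAnswer: string; // the agent's final reply
};

// End-to-end check: only tells you that *something* went wrong.
const endToEndOk = (trace: StepTrace, expectedAnswer: string) =>
  trace.finalAnswer === expectedAnswer;

// Per-step checks: point at the exact step that failed.
const toolStepOk = (trace: StepTrace, expectedTool: string) =>
  trace.toolCall === expectedTool;
const answerStepOk = (trace: StepTrace, expectedAnswer: string) =>
  trace.finalAnswer.includes(expectedAnswer);
```
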
### Evaluation dataset should be representative

Ideally, use real production traces as evaluation datasets. If this is not available,
make sure that the dataset is as similar to production as possible. It is very easy to
miss important nuances when evaluating on a synthetic dataset, especially if it is created
without the production examples in mind.

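As a rough sketch (the `productionLogs` array and its fields are hypothetical), turning logged
production inputs into evaluation data of the shape used later in this post might look like this:

```typescript
// Hypothetical entries collected from real production traffic.
const productionLogs = [
  { userInput: "What's the weather in Berlin?", toolUsed: "get_weather" },
  { userInput: "Book a table for two at 7pm", toolUsed: "book_restaurant" },
  // ...
];

// Reshape them into the { data, target } entries that the evaluation below consumes.
const data = productionLogs.map((log) => ({
  data: { input: log.userInput },
  target: log.toolUsed,
}));
```
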
### Adopt evals as early as possible

Start doing evaluations as soon as you can. Track the progress of your prompts' performance
over time and as you make changes. You will learn a lot about your system and the best ways
to design it.

## Evaluations on Laminar

We built evaluations on Laminar with all of the above in mind. We stick to the following principles:

- Evaluations are defined in code.
- An executor (forward run) must be actual production code.
- Evaluators are just functions that take an executor's output and a target output, and return a result.
- All of the above must be defined by the user.
- Evaluations require as little intervention as possible, so you can run them manually or in your CI/CD pipeline.

Laminar evaluations are a tool for you to rigorously analyze the quality of your system. We implemented
them in such a way that they don't get in the way of your development process, and give you the flexibility
to use them in any way you want.

### Simple evaluation

Suppose you have a step in your agent that decides which tool to call based on the input. It could be
a simple call to an LLM API with tool definitions, for example:

#### TypeScript

```typescript
import OpenAI from "openai";

const openai = new OpenAI();

const tools: OpenAI.Chat.Completions.ChatCompletionTool[] = [
  {
    type: "function",
    function: {
      name: "my_tool_1",
      // JSON Schema describing the tool's arguments
      parameters: {
        // ...
      },
    },
  },
  // ...
];

async function decideTool(input: string) {
  const response = await openai.chat.completions.create({
    model: "gpt-4o",
    messages: [{ role: "user", content: input }],
    tools,
  });
  return response.choices[0].message.tool_calls;
}
```

#### Python

```python
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageToolCall as ToolCall

client = AsyncOpenAI()

tools = [
    {
        "type": "function",
        "function": {
            "name": "my_tool_1",
            # JSON Schema describing the tool's arguments
            "parameters": {
                # ...
            },
        },
    },
    # ...
]

async def decide_tool(input: str) -> list[ToolCall]:
    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": input}],
        tools=tools,
    )
    return response.choices[0].message.tool_calls
```

Let's say we have collected a dataset of inputs and expected outputs for this function.
We can define an evaluation for it as follows:

#### TypeScript

```typescript
// my-eval.ts
import { evaluate } from '@lmnr-ai/lmnr';
import { decideTool } from './my-module';

const data = [
  { data: { input: "input 1" }, target: "tool1" },
  { data: { input: "input 2" }, target: "tool2" },
  // ...
];

evaluate({
  data,
  executor: decideTool,
  evaluators: {
    // decideTool returns a list of tool calls; compare the first call's name
    exactTool: (output, target) => output?.[0]?.function?.name === target,
  },
  config: {
    projectApiKey: "...", // Your Laminar project API key
  },
});
```

And then simply run `npx lmnr eval my-eval.ts`.

#### Python

```python
# my-eval.py
from lmnr import evaluate
from my_module import decide_tool

data = [
    {"data": {"input": "input 1"}, "target": "tool1"},
    # ...
]

evaluate(
    data,
    executor=decide_tool,
    evaluators={
        # decide_tool returns a list of tool calls; compare the first call's name
        "exact_tool": lambda output, target: output[0].function.name == target,
    },
    project_api_key="...",
)
```

And then simply run `lmnr eval my-eval.py`.

You will see the results in your Laminar project dashboard.

![Sample evaluation results](/blog/2024-12-03-evals-img-1.png)

And if you repeat this evaluation multiple times, you will see the progression of the evaluation scores over time.

![Evaluation scores over time](/blog/2024-12-03-evals-img-2-score-progress.png)

### Many more features

In addition to the visualization, Laminar evaluations allow you to:

- See the full trace of the entire run, including all LLM calls,
- Compare scores across runs,
- Register human labelers to assign scores alongside these programmatic evaluations,
- Use datasets hosted on Laminar to run evaluations.

Read more in the [docs](https://docs.lmnr.ai/evaluations/introduction).