Merge pull request #259 from lmnr-ai/dev
Allow disabling tracing, search by spans
dinmukhamedm authored Dec 4, 2024
2 parents d244713 + 2aabcf2 commit 8b91d3a
Showing 16 changed files with 509 additions and 10 deletions.
8 changes: 8 additions & 0 deletions app-server/src/traces/producer.rs
@@ -36,6 +36,10 @@ pub async fn push_spans_to_queue(
for otel_span in scope_span.spans {
let mut span = Span::from_otel_span(otel_span.clone());

if !span.should_save() {
continue;
}

let span_usage = super::utils::get_llm_usage_for_span(
&mut span.get_attributes(),
db.clone(),
@@ -94,6 +98,10 @@ pub async fn push_spans_to_queue(
}
}

if !span.should_save() {
continue;
}

let rabbitmq_span_message = RabbitMqSpanMessage {
project_id,
span,
25 changes: 24 additions & 1 deletion app-server/src/traces/spans.rs
@@ -3,7 +3,7 @@ use std::{collections::HashMap, sync::Arc};
use anyhow::Result;
use chrono::{TimeZone, Utc};
use regex::Regex;
use serde::Serialize;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use uuid::Uuid;

@@ -38,6 +38,14 @@ const OUTPUT_ATTRIBUTE_NAME: &str = "lmnr.span.output";
/// null. We hackily use this when we wrap a span in a NonRecordingSpan that
/// is not sent to the backend – this is done to overwrite trace IDs for spans.
const OVERRIDE_PARENT_SPAN_ATTRIBUTE_NAME: &str = "lmnr.internal.override_parent_span";
const TRACING_LEVEL_ATTRIBUTE_NAME: &str = "lmnr.internal.tracing_level";

#[derive(Debug, Clone, PartialEq, Deserialize)]
#[serde(rename_all = "snake_case")]
enum TracingLevel {
Off,
MetaOnly,
}

pub struct SpanAttributes {
pub attributes: HashMap<String, Value>,
@@ -239,6 +247,12 @@ impl SpanAttributes {
}
res
}

fn tracing_level(&self) -> Option<TracingLevel> {
self.attributes
.get(TRACING_LEVEL_ATTRIBUTE_NAME)
.and_then(|s| serde_json::from_value(s.clone()).ok())
}
}

impl Span {
@@ -253,6 +267,10 @@ impl Span {
self.attributes = serde_json::to_value(&attributes.attributes).unwrap();
}

pub fn should_save(&self) -> bool {
self.get_attributes().tracing_level() != Some(TracingLevel::Off)
}

/// Create a span from an OpenTelemetry span.
///
/// This is called on the producer side of the MQ, i.e. at the OTel ingester
@@ -379,6 +397,11 @@ impl Span {
span.parent_span_id = None;
}

if let Some(TracingLevel::MetaOnly) = span.get_attributes().tracing_level() {
span.input = None;
span.output = None;
}

span
}

12 changes: 10 additions & 2 deletions frontend/app/api/projects/[projectId]/spans/route.ts
@@ -1,4 +1,4 @@
import { and, desc, eq, getTableColumns, inArray, sql } from 'drizzle-orm';
import { and, desc, eq, getTableColumns, inArray, or, sql } from 'drizzle-orm';
import { FilterDef, filtersToSql } from '@/lib/db/modifiers';
import { getDateRangeFilters, paginatedGet } from '@/lib/db/utils';
import { labelClasses, labels, spans, traces } from '@/lib/db/migrations/schema';
@@ -50,6 +50,14 @@ export async function GET(
);
});

const textSearch = req.nextUrl.searchParams.get("search");
const textSearchFilters = textSearch ? [
or(
sql`input::text LIKE ${`%${textSearch}%`}::text`,
sql`output::text LIKE ${`%${textSearch}%`}::text`
)!
] : [];

urlParamFilters = urlParamFilters
// labels are handled separately above
.filter(filter => filter.column !== "labels")
@@ -103,7 +111,7 @@ export async function GET(
sql`project_id = ${projectId}`
];

const filters = getDateRangeFilters(startTime, endTime, pastHours).concat(sqlFilters, labelFilters);
const filters = getDateRangeFilters(startTime, endTime, pastHours).concat(sqlFilters, labelFilters, textSearchFilters);
// don't query input and output, only query previews
const { input, output, ...columns } = getTableColumns(spans);

11 changes: 7 additions & 4 deletions frontend/app/blog/[slug]/page.tsx
@@ -48,21 +48,24 @@ export default async function BlogPostPage({ params }: { params: { slug: string
{/* <ScrollArea className="h-full flex-grow w-full mx-auto bg-background px-16">
<div className="h-0"> */}
<BlogMeta data={data} />
<div className="pt-12 pb-48">
<div className="pt-4 pb-48">
<MDXRemote
source={content}
components={{
h1: (props) => <MDHeading props={props} level={0} />,
h2: (props) => <MDHeading props={props} level={1} />,
h3: (props) => <MDHeading props={props} level={2} />,
h4: (props) => <MDHeading props={props} level={3} />,
p: (props) => <p className="text-lg py-2" {...props} />,
a: (props) => <a className="text-primary underline" {...props} />,
p: (props) => <p className="py-2 text-secondary-foreground" {...props} />,
a: (props) => <a className="text-primary underline" target="_blank" rel="noopener noreferrer" {...props} />,
blockquote: (props) => <blockquote className="border-l-2 border-primary pl-4 py-2" {...props} />,
// codeblock
pre: (props) => <PreHighlighter className="pl-4 py-4" {...props} />,
// inline code
code: (props) => <span className="text-lg bg-secondary text-primary font-mono px-0.5" {...props} />,
code: (props) => <span className="text-sm bg-secondary text-primary font-mono px-0.5" {...props} />,
ul: (props) => <ul className="list-disc pl-4 text-secondary-foreground" {...props} />,
ol: (props) => <ol className="list-decimal pl-4 text-secondary-foreground" {...props} />,
img: (props) => <img className="w-full border rounded-lg" {...props} />,
}}
/>
</div>
12 changes: 11 additions & 1 deletion frontend/assets/blog/2024-12-01-launch-week-1.mdx
@@ -16,5 +16,15 @@ Stay tuned for Laminar launches and follow us on [X](https://x.com/skull88888888

## Launch day 1, December 2

_Coming soon._
Flow – a dynamic task engine for building AI agents. See my [X post](https://x.com/skull8888888888/status/1863661536180572412)
for more details.

## Launch day 2, December 3

Evaluations that just work. Read our [blog post](/blog/2024-12-03-evals)
for more details.

## Launch day 3, December 4

Semantic search – a way to find the most similar examples in your dataset. Read our
[blog post](/blog/2024-12-04-semantic-search) for more details.
208 changes: 208 additions & 0 deletions frontend/assets/blog/2024-12-03-evals.mdx
@@ -0,0 +1,208 @@
---
title: "Launch Week #1, Day 2: Evaluations"
date: "2024-12-03"
description: "Evaluations on Laminar"
author:
name: Dinmukhamed Mailibay
url: https://x.com/din_mlb
image: /blog/2024-12-03-evals.png
tags: ["launch week 1", "evaluations"]
---

It is no secret that evaluations are a critical part of any system with any
amount of non-deterministic behavior, and LLMs are not an exception.

Almost every single LLM or AI dev tool has evaluations today, and many people even
consider evaluations a core part of LLM observability. Many platforms started with observability
and added evals later, while others did the opposite. Evaluations can take many forms, from simple
deterministic checks to LLM-as-a-judge.

In this blog post, we briefly talk about evaluations in general and what we believe are
the best practices, and then dive into how to use evaluations on Laminar.

## What is an evaluation?

From the meaning of the word itself, an evaluation is the process of assessing the value or quality of something.
In the context of LLMs, it is the process of assessing a model's output given a prompt and an input.

We like to think of evaluations as unit tests for LLMs. The main difference here is that
the tests are not deterministic, and the same input can produce different outputs on different runs.

Thus, simple assertions like `output == expected_output` are not enough, and the result of
each evaluation can be broader than just pass or fail.
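
For example, an evaluator can return a numeric score instead of a boolean. Here is a minimal sketch in TypeScript; the `keywordOverlapScore` function and its scoring rule are purely illustrative, not a Laminar API:

```typescript
// Hypothetical evaluator: grades an answer by keyword overlap with a reference,
// returning a score between 0 and 1 instead of a simple pass/fail.
function keywordOverlapScore(output: string, target: string): number {
  const targetWords = new Set(target.toLowerCase().split(/\s+/).filter(Boolean));
  if (targetWords.size === 0) {
    return 0;
  }
  const outputWords = new Set(output.toLowerCase().split(/\s+/).filter(Boolean));
  let hits = 0;
  for (const word of targetWords) {
    if (outputWords.has(word)) {
      hits += 1;
    }
  }
  return hits / targetWords.size;
}
```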

## Best practices

Here are some of the things we believe work best for evaluations.

### Define evaluations in code

LLM-as-a-judge is powerful, but code gives you much more flexibility. And guess what? You can call
LLM-as-a-judge in code, so code evaluations are, in a sense, a superset of LLM-as-a-judge.
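
To illustrate, here is roughly what such a code evaluator could look like. This is a minimal sketch assuming the OpenAI Node SDK; the `llmJudge` name, the model choice, and the grading prompt are our own illustration, not part of Laminar:

```typescript
import OpenAI from "openai";

const openai = new OpenAI();

// Hypothetical LLM-as-a-judge evaluator written as a plain function: it asks a
// model to grade an output against a reference and returns a score from 0 to 1.
async function llmJudge(output: string, target: string): Promise<number> {
  const response = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [
      {
        role: "user",
        content:
          `Rate from 0 to 1 how well the answer matches the reference.\n` +
          `Answer: ${output}\nReference: ${target}\n` +
          `Reply with a single number.`,
      },
    ],
  });
  return Number(response.choices[0].message.content ?? "0");
}
```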

### Evaluate each step separately

It is very tempting to evaluate the entire execution of a complex agent, especially if you want
to make sure the agent does not go off-track. While there is value in doing so, we think
that evaluating each step separately is even more important.

The main problem with evaluating the entire execution is that it is hard to tell which step
caused a failure and at what point it happened. There are just too many moving parts.

On the other hand, if we evaluate each step separately, we can pinpoint exactly where the failure is coming from.

### Evaluation datasets should be representative

Ideally, use real production traces as evaluation datasets. If that is not possible,
make sure that the dataset is as similar to production as possible. It is very easy to
miss important nuances when evaluating on a synthetic dataset, especially one created
without production examples in mind.

### Adopt evals as early as possible

Start doing evaluations as soon as you can. Track the progress of your prompts' performance
over time and as you make changes. You will learn a lot about your system and the best ways
to design it.

## Evaluations on Laminar

We built evaluations on Laminar with all of the above in mind. We stick to the following principles:

- Evaluations are defined in code.
- An executor (forward run) must be actual production code.
- Evaluators are just functions that take an executor's output and target output, and return a result.
- All of the above must be defined by the user.
- Evaluations require as little intervention as possible, so you can run them manually or in your CI/CD pipeline.

Laminar evaluations are a tool for you to analyze the quality of your system rigorously. We implemented
them in such a way that they don't get in the way of your development process, and give you the flexibility
to use them in any way you want.

### Simple evaluation

Suppose you have a step in your agent that decides which tool to call based on the input. It could be
a simple call to an LLM API with tool definitions, for example:

#### TypeScript

```typescript
import OpenAI from "openai";

const openai = new OpenAI();

const tools = [
  {
    type: "function",
    function: {
      name: "my_tool_1",
      parameters: {
        // ...
      },
    },
  },
  // ...
];

async function decideTool(input: string) {
  const response = await openai.chat.completions.create({
    model: "gpt-4o",
    messages: [{ role: "user", content: input }],
    tools,
  });
  return response.choices[0].message.tool_calls;
}
```

#### Python

```python
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionMessageToolCall

openai = AsyncOpenAI()

tools = [
    {
        "type": "function",
        "function": {
            "name": "my_tool_1",
            "parameters": {
                # ...
            },
        },
    },
    # ...
]

async def decide_tool(input: str) -> list[ChatCompletionMessageToolCall]:
    response = await openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": input}],
        tools=tools,
    )
    return response.choices[0].message.tool_calls
```

Let's say we have collected a dataset of inputs and expected outputs for this function.
We can define an evaluation for it as follows:

#### TypeScript

```typescript
// my-eval.ts
import { evaluate } from '@lmnr-ai/lmnr';
import { decideTool } from './my-module';

const data = [
  { data: { input: "input 1" }, target: "tool1" },
  { data: { input: "input 2" }, target: "tool2" },
  // ...
];

evaluate({
  data,
  executor: decideTool,
  evaluators: {
    // decideTool returns a list of tool calls, so compare the first call's name
    exactTool: (output, target) => output?.[0]?.function?.name === target,
  },
  config: {
    projectApiKey: "...", // Your Laminar project API key
  },
});
```

And then simply run `npx lmnr eval my-eval.ts`.

#### Python

```python
# my-eval.py
from lmnr import evaluate
from my_module import decide_tool

data = [
    {"data": {"input": "input 1"}, "target": "tool1"},
    # ...
]

evaluate(
    data,
    executor=decide_tool,
    evaluators={
        # decide_tool returns a list of tool calls, so compare the first call's name
        "exact_tool": lambda output, target: output[0].function.name == target,
    },
    project_api_key="...",
)
```

And then simply run `lmnr eval my-eval.py`.

You will see the results in your Laminar project dashboard.

![Sample evaluation results](/blog/2024-12-03-evals-img-1.png)

And if you repeat this evaluation multiple times, you will see the progression of the evaluation scores over time.

![Evaluation scores over time](/blog/2024-12-03-evals-img-2-score-progress.png)

### Many more features

In addition to the visualization, Laminar evaluations allow you to:

- See the full trace of the entire run, including all LLM calls,
- Compare scores across runs,
- Register human labelers to assign scores alongside these programmatic evaluations,
- Use datasets hosted on Laminar to run evaluations.

Read more in the [docs](https://docs.lmnr.ai/evaluations/introduction).