Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: evaluation results improvements #399

Merged
merged 4 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 90 additions & 17 deletions packages/frontend/components/evals/ResultsMatrix.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import {
Badge,
Button,
Code,
Group,
HoverCard,
Expand All @@ -12,6 +13,7 @@ import { formatCost } from "@/utils/format"
import { ChatMessage } from "../SmartViewer/Message"
import SmartViewer from "../SmartViewer"
import { MODELS, Provider } from "shared"
import { IconFileExport } from "@tabler/icons-react"

// We build a matrix of results covering every combination of prompt, variable variation and model.
// The matrix is a 3D array, with one dimension each for variables, prompts and models.
Expand Down Expand Up @@ -63,23 +65,25 @@ function ResultDetails({ details }) {
)
}

function ResultCell({ result }) {
function ResultCell({ result, showTestIndicator }) {
return result ? (
<>
{result.status === "success" ? (
<Stack align="center" justify="between">
<ChatMessage data={result.output} mah={300} compact w="100%" />

<HoverCard width={500} disabled={!result.results.length}>
<HoverCard.Target>
<Badge color={result.passed ? "green" : "red"}>
{result.passed ? "Passed" : "Failed"}
</Badge>
</HoverCard.Target>
<HoverCard.Dropdown>
<ResultDetails details={result.results} />
</HoverCard.Dropdown>
</HoverCard>
{showTestIndicator && (
<HoverCard width={500} disabled={!result.results.length}>
<HoverCard.Target>
<Badge color={result.passed ? "green" : "red"}>
{result.passed ? "Passed" : "Failed"}
</Badge>
</HoverCard.Target>
<HoverCard.Dropdown>
<ResultDetails details={result.results} />
</HoverCard.Dropdown>
</HoverCard>
)}
<Group gap="xs">
<Text c="dimmed" size="xs">
{(+result.duration / 1000).toFixed(2)}s -{" "}
Expand All @@ -96,12 +100,12 @@ function ResultCell({ result }) {
)
}

function AggregateContent({ results }) {
function AggregateContent({ results, showTestIndicator }) {
const { passed, failed, duration, cost } = getAggegateForVariation(results)

return (
<>
{passed + failed > 1 && (
{passed + failed > 1 && showTestIndicator && (
<Progress.Root size={20} w={100}>
<Progress.Section
value={(passed / (passed + failed)) * 100}
Expand Down Expand Up @@ -133,7 +137,7 @@ function AggregateContent({ results }) {
)
}

export default function ResultsMatrix({ data }) {
export default function ResultsMatrix({ data, showTestIndicator }) {
const prompts = Array.from(
new Set(data.map((result) => JSON.stringify(result.messages))),
).map((result: any) => JSON.parse(result))
Expand Down Expand Up @@ -174,8 +178,72 @@ export default function ResultsMatrix({ data }) {
...prompts.map((messages) => getVariableKeysForPrompt(messages).length),
)

/**
 * Export the evaluation results as a CSV file and trigger a browser download
 * named "results.csv".
 *
 * One row is written per (prompt, variable variation, provider) combination
 * for which a result exists; combinations without a result are skipped.
 * Columns: Prompt, Variable Variation, Model, Passed, Output.
 *
 * Every data field is quoted, because the JSON-encoded prompt and variables
 * always contain commas and double quotes and would otherwise break the
 * column layout.
 */
function exportToCsv() {
  // Serialize one value as a quoted CSV field: strings are used as-is,
  // null/undefined become an empty field, anything else is JSON-encoded.
  // Embedded double quotes are doubled, as CSV quoting requires.
  const toCsvField = (value: unknown): string => {
    let text: string
    if (typeof value === "string") {
      text = value
    } else if (value == null) {
      text = ""
    } else {
      // JSON.stringify returns undefined for functions/symbols; fall back to String().
      text = JSON.stringify(value) ?? String(value)
    }
    return `"${text.replace(/"/g, '""')}"`
  }

  const columns = [
    "Prompt",
    "Variable Variation",
    "Model",
    "Passed",
    "Output",
  ]
  const rows: string[][] = []

  for (const messages of prompts) {
    const variableVariations = getVariableVariationsForPrompt(messages)
    for (const variables of variableVariations) {
      for (const provider of providers) {
        const result = getResultForPromptVariationProvider(
          messages,
          variables,
          provider,
        )
        if (!result) continue

        // Prefer the error payload when the run failed; otherwise use the
        // model output. The output content may be missing or non-string,
        // which toCsvField handles without throwing.
        const textResult = result.error
          ? JSON.stringify(result.error)
          : result.output?.content

        rows.push([
          toCsvField(JSON.stringify(messages)),
          toCsvField(JSON.stringify(variables)),
          toCsvField(provider.model),
          toCsvField(result.passed ? "Yes" : "No"),
          toCsvField(textResult),
        ])
      }
    }
  }

  const csvContent = [
    columns.join(","),
    ...rows.map((row) => row.join(",")),
  ].join("\n")

  const blob = new Blob([csvContent], { type: "text/csv;charset=utf-8;" })
  const url = URL.createObjectURL(blob)

  // Download through a temporary hidden anchor so no navigation happens.
  const link = document.createElement("a")
  link.setAttribute("href", url)
  link.setAttribute("download", "results.csv")
  link.style.visibility = "hidden"
  document.body.appendChild(link)
  try {
    link.click()
  } finally {
    document.body.removeChild(link)
    // Release the blob once the click has been dispatched; deferring one
    // tick avoids revoking before the browser has started the download.
    setTimeout(() => URL.revokeObjectURL(url), 0)
  }
}

return (
<Stack>
<>
<Button
w="fit-content"
ml="auto"
variant="light"
color="blue"
onClick={() => {
exportToCsv()
}}
leftSection={<IconFileExport size={16} />}
>
Export to CSV
</Button>
<div className={classes["matrix-container"]}>
<table className={classes["matrix-table"]}>
<thead>
Expand Down Expand Up @@ -203,6 +271,7 @@ export default function ResultsMatrix({ data }) {
</HoverCard.Dropdown>
</HoverCard>
<AggregateContent
showTestIndicator={showTestIndicator}
results={data.filter((result) =>
compareObjects(result.provider, provider),
)}
Expand Down Expand Up @@ -234,6 +303,7 @@ export default function ResultsMatrix({ data }) {
</HoverCard.Dropdown>
</HoverCard>
<AggregateContent
showTestIndicator={showTestIndicator}
results={data.filter((result) =>
compareObjects(result.messages, messages),
)}
Expand Down Expand Up @@ -265,7 +335,10 @@ export default function ResultsMatrix({ data }) {
)
return (
<td className={classes["output-cell"]} key={k}>
<ResultCell result={result} />
<ResultCell
result={result}
showTestIndicator={showTestIndicator}
/>
</td>
)
})}
Expand All @@ -275,6 +348,6 @@ export default function ResultsMatrix({ data }) {
</tbody>
</table>
</div>
</Stack>
</>
)
}
5 changes: 3 additions & 2 deletions packages/frontend/components/prompts/PromptVariableEditor.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ export default function PromptVariableEditor({
>
<Badge
key={name}
miw={50}
maw={90}
miw="fit-content"
maw={100}
px="sm"
variant="outline"
tt="none"
Expand All @@ -54,6 +54,7 @@ export default function PromptVariableEditor({
w="100%"
required={true}
radius="sm"
placeholder="Enter content here"
rows={1}
maxRows={1}
name={name}
Expand Down
24 changes: 17 additions & 7 deletions packages/frontend/components/prompts/VariableTextarea.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {
ActionIcon,
Box,
Button,
MantineSize,
Modal,
Text,
Textarea,
Expand All @@ -14,11 +15,15 @@ import { IconArrowsMaximize } from "@tabler/icons-react"
/**
 * Props accepted by VariableTextarea: everything Mantine's Textarea accepts
 * (via TextareaProps) plus the fields declared below.
 */
type VariableTextareaProps = TextareaProps & {
// Identifier of the edited variable — presumably the prompt variable's name; confirm against callers.
name: string
// Current text; passed as `value` to the underlying Textarea.
value: string
// Optional width, applied to the Box that wraps the inline Textarea.
w?: MantineSize
// Change handler forwarded to the Textarea(s) rendered by the component.
onChange: (event: React.ChangeEvent<HTMLTextAreaElement>) => void
// Accepts any additional prop; the remaining props are spread onto the inline Textarea.
// NOTE(review): this open index signature disables excess-property checking for callers.
[key: string]: any
}

export default function VariableTextarea({
name,
value,
w,
onChange,
...props
}: VariableTextareaProps) {
Expand All @@ -29,7 +34,7 @@ export default function VariableTextarea({
<Modal
opened={opened}
onClose={close}
title={<Title order={3}>Edit variable</Title>}
title={<Title order={3}>Edit variable content</Title>}
overlayProps={{
backgroundOpacity: 0.55,
blur: 3,
Expand All @@ -38,6 +43,7 @@ export default function VariableTextarea({
>
<Textarea
size="md"
placeholder="Paste variable content here"
radius="sm"
minRows={2}
rows={10}
Expand All @@ -46,12 +52,17 @@ export default function VariableTextarea({
onChange={onChange}
/>

<Button my="md" style={{ float: "right" }} onClick={close}>
<Button
my="md"
variant="default"
style={{ float: "right" }}
onClick={close}
>
Save
</Button>
</Modal>

<Box style={{ position: "relative" }}>
<Box style={{ position: "relative" }} w={w}>
<Textarea {...props} onChange={onChange} value={value} />
<ActionIcon
size="xs"
Expand All @@ -60,12 +71,11 @@ export default function VariableTextarea({
variant="transparent"
style={{
position: "absolute",
right: "5%",
bottom: "5%",
marginBottom: "0.3rem",
right: "10px",
bottom: "7px",
}}
>
<IconArrowsMaximize />
<IconArrowsMaximize size={14} />
</ActionIcon>
</Box>
</>
Expand Down
5 changes: 3 additions & 2 deletions packages/frontend/pages/datasets/[id].tsx
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,10 @@ function PromptVariation({ i, variationId, content, onDelete, markSaved }) {
label="Ideal output (optional)"
description="Useful for assessing the proximity of the LLM response to an anticipated output."
required={false}
placeholder="What would be the ideal output for this variation?"
autosize
maxRows={6}
minRows={3}
maxRows={8}
minRows={1}
value={variation?.idealOutput || ""}
onChange={(e) => setIdealOutput(e.target.value)}
/>
Expand Down
8 changes: 7 additions & 1 deletion packages/frontend/pages/evaluations/[id].tsx
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ export default function EvalResults() {
const { checklist } = useChecklist(evaluation?.checklistId)
const { dataset } = useDataset(evaluation?.datasetId)

// Only show 'Passed' labels if a checklist was used
const showTestIndicator = !!checklist

return (
<Container size="100%">
<Stack>
Expand Down Expand Up @@ -84,7 +87,10 @@ export default function EvalResults() {
<>
{data?.length > 0 ? (
<Stack gap="xl">
<ResultsMatrix data={data} />
<ResultsMatrix
data={data}
showTestIndicator={showTestIndicator}
/>
</Stack>
) : (
<p>No data</p>
Expand Down
Loading