diff --git a/packages/backend/src/api/v1/evaluations/index.ts b/packages/backend/src/api/v1/evaluations/index.ts index e27cb461..59d6d292 100644 --- a/packages/backend/src/api/v1/evaluations/index.ts +++ b/packages/backend/src/api/v1/evaluations/index.ts @@ -49,15 +49,20 @@ evaluations.post( checks: [], // TODO: remove this legacy col from DB, } - const [insertedEvaluation] = + const [evaluation] = await sql`insert into evaluation ${sql(evaluationToInsert)} returning *` - const evaluation = await getEvaluation(insertedEvaluation.id) + const prompts = await sql` + select * from dataset_prompt where dataset_id = ${datasetId} + ` let count = 0 - for (const prompt of evaluation.dataset.prompts) { - for (const variation of prompt.variations) { + for (const prompt of prompts) { + const variations = await sql` + select * from dataset_prompt_variation where prompt_id = ${prompt.id} + ` + for (const variation of variations) { for (const provider of evaluation.providers) { count++ queue.add(() => @@ -66,7 +71,7 @@ evaluations.post( promptId: prompt.id, variation, provider, - prompt: prompt.content, + prompt: prompt.messages, checklistId, }), ) @@ -121,9 +126,7 @@ evaluations.get( const results = await sql` select *, - p.id as prompt_id, - p.messages as prompt_content - --p.extra as prompt_extra + p.id as prompt_id from evaluation_result er left join dataset_prompt p on p.id = er.prompt_id diff --git a/packages/backend/src/api/v1/evaluations/utils.ts b/packages/backend/src/api/v1/evaluations/utils.ts index 958f6e79..1ffa8d84 100644 --- a/packages/backend/src/api/v1/evaluations/utils.ts +++ b/packages/backend/src/api/v1/evaluations/utils.ts @@ -133,71 +133,3 @@ export async function runEval({ console.error(error) } } - -export async function getEvaluation(evaluationId: string) { - const rows = await sql` - select - e.id as id, - e.created_at as created_at, - e.name as name, - e.project_id as project_id, - e.owner_id as owner_id, - e.providers as providers, - e.checks as checks, - d.id as dataset_id, - d.slug as dataset_slug, - p.id as prompt_id, - p.messages as prompt_messages, - pv.id as variation_id, - pv.variables, - pv.context, - pv.ideal_output - from - evaluation e - left join dataset d on e.dataset_id = d.id - left join dataset_prompt p on d.id = p.dataset_id - left join dataset_prompt_variation pv on pv.prompt_id = p.id - where - e.id = ${evaluationId} - ` - - const { - id, - createdAt, - name, - ownerId, - projectId, - providers, - checks, - datasetId, - datasetSlug, - } = rows[0] - - const evaluation = { - id, - createdAt, - name, - projectId, - ownerId, - providers, - checks, - dataset: { - id: datasetId, - slug: datasetSlug, - prompts: rows.map(({ promptId, promptMessages }) => ({ - id: promptId, - content: promptMessages, - variations: rows - .filter((row) => row.promptId === promptId) - .map(({ variationId, variables, context, idealOutput }) => ({ - id: variationId, - variables, - context, - idealOutput, - })), - })), - }, - } - - return evaluation -} diff --git a/packages/db/0009.sql b/packages/db/0009.sql new file mode 100644 index 00000000..24ed57e6 --- /dev/null +++ b/packages/db/0009.sql @@ -0,0 +1,8 @@ +alter table evaluation add column if not exists checklist_id uuid; +alter table evaluation DROP CONSTRAINT IF EXISTS evaluation_checklist_id_fkey; +alter table evaluation add constraint evaluation_checklist_id_fkey foreign key (checklist_id) references checklist(id) on delete set null; + +drop table if exists evaluation_prompt cascade; +drop table if exists evaluation_prompt_variation cascade; + +alter table evaluation_result add constraint "fk_evaluation_result_prompt_id" foreign key (prompt_id) references dataset_prompt(id) on delete cascade; diff --git a/packages/frontend/components/SmartViewer/Message.tsx b/packages/frontend/components/SmartViewer/Message.tsx index b7309703..2de6bf80 100644 --- a/packages/frontend/components/SmartViewer/Message.tsx +++ b/packages/frontend/components/SmartViewer/Message.tsx @@ -223,6 +223,7 @@ export function ChatMessage({ onChange, compact = false, mah, + ...props }: { data: any editable?: boolean @@ -252,6 +253,7 @@ export function ChatMessage({ })`, borderRadius: 8, }} + {...props} > {!compact && ( <> diff --git a/packages/frontend/components/blocks/Feedback.tsx b/packages/frontend/components/blocks/Feedback.tsx index 4961d6d0..db1f14c1 100644 --- a/packages/frontend/components/blocks/Feedback.tsx +++ b/packages/frontend/components/blocks/Feedback.tsx @@ -34,9 +34,12 @@ export default function Feedback({ }) return ( - + - {/* */} { return JSON.stringify(a) === JSON.stringify(b) } -function getResultForVariation( - promptId: string, - variables: { [key: string]: string }, - provider: Provider, - evalResults, -): any | undefined { - const result = evalResults.find( - (result) => - (promptId ? result.promptId === promptId : true) && - (provider ? compareObjects(result.provider, provider) : true) && - (Object.keys(variables).length === 0 - ? Object.keys(result.variables).length === 0 - : true) && - Object.keys(variables).every( - (variable) => - result.variables.hasOwnProperty(variable) && - result.variables[variable] === variables[variable], - ), - ) - - return result -} const getAggegateForVariation = ( - promptId: string, - provider: Provider, - evalResults, + results, ): { passed: number // percentage passed failed: number // percentage failed duration: number // average duration cost: number // average cost } => { - const results = evalResults.filter( - (result) => - (promptId ? result.promptId === promptId : true) && - (provider ? compareObjects(result.provider, provider) : true), - ) - return { passed: results.filter((result) => result.passed).length, failed: results.filter((result) => !result.passed).length, @@ -64,47 +41,6 @@ const getAggegateForVariation = ( } } -const getVariableVariations = (results) => { - const variations = results.map((result) => result.variables) - const uniqueVariations = Array.from( - new Set(variations.map((variation) => JSON.stringify(variation))), - ).map((variation) => JSON.parse(variation)) - - return uniqueVariations as { [key: string]: string }[] -} - -const getPromptModelVariations = (results) => { - let variations = results.map((result) => ({ - promptContent: result.promptContent, - promptId: result.promptId, - provider: result.provider, - })) - - const uniqueVariations = Array.from( - new Set(variations.map((variation) => JSON.stringify(variation))), - ) - .map((variation) => JSON.parse(variation)) - .map((variation) => { - return { - ...variation, - ...getAggegateForVariation( - variation.promptId, - variation.provider, - results, - ), - } - }) - - return uniqueVariations as { - promptId?: string - promptContent?: any - provider?: Provider - passed: number - failed: number - duration: number - cost: number - }[] -} function ResultDetails({ details }) { if (typeof details !== "object") { return Details not available @@ -127,12 +63,116 @@ function ResultDetails({ details }) { ) } +function ResultCell({ result }) { + return result ? ( + <> + {result.status === "success" ? ( + + + + + + + {result.passed ? "Passed" : "Failed"} + + + + + + + + + {(+result.duration / 1000).toFixed(2)}s -{" "} + {formatCost(result.cost)} + + + + ) : ( + {result.error || "Error"} + )} + + ) : ( + N/A + ) +} + +function AggregateContent({ results }) { + const { passed, failed, duration, cost } = getAggegateForVariation(results) + + return ( + <> + {passed + failed > 1 && ( + + + {`${passed}`} + + + {failed} + + + )} + + {duration && ( + + avg. {duration}s + + )} + {cost && ( + + avg. {formatCost(cost)} + + )} + + + ) +} + export default function ResultsMatrix({ data }) { - const variableVariations = getVariableVariations(data) + const prompts = Array.from( + new Set(data.map((result) => JSON.stringify(result.messages))), + ).map((result: any) => JSON.parse(result)) + + const providers: Provider[] = Array.from( + new Set(data.map((result) => JSON.stringify(result.provider))), + ).map((provider: any) => JSON.parse(provider)) + + function getVariableKeysForPrompt(messages) { + return Object.keys( + data.find((result) => compareObjects(result.messages, messages)) + .variables || {}, + ) + } - const pmVariations = getPromptModelVariations(data) + function getVariableVariationsForPrompt(messages) { + const variations = [ + ...new Set( + data + .filter((result) => compareObjects(result.messages, messages)) + .map((result) => JSON.stringify(result.variables)), + ), + ] - const variables = Array.from(new Set(variableVariations.flatMap(Object.keys))) + return variations.map((variation: any) => JSON.parse(variation)) + } + + function getResultForPromptVariationProvider(messages, variables, provider) { + return data.find( + (result) => + compareObjects(result.messages, messages) && + compareObjects(result.variables, variables) && + compareObjects(result.provider, provider), + ) + } + + const highestNumberOfVariables = Math.max( + ...prompts.map((messages) => getVariableKeysForPrompt(messages).length), + ) return ( @@ -140,156 +180,98 @@ export default function ResultsMatrix({ data }) { - {!!variables.length && ( - - )} - - - - {variables.map((variable, i) => ( - + + {!!highestNumberOfVariables && } + {providers.map((provider, i) => ( + ))} - {pmVariations.map( - ( - { - provider, - promptId, - promptContent, - passed, - failed, - duration, - cost, - }, - index, - ) => { - return ( - - ) - }, - )} - {variableVariations.map((variableVariation, i) => ( - - {variables.map((variable) => ( - - ))} - {pmVariations.map((pmVariation, k) => { - const result = getResultForVariation( - pmVariation.promptId, - variableVariation, - pmVariation.provider, - data, - ) - return ( - + {k === 0 && ( + + )} + {!!highestNumberOfVariables && ( + - ) - })} - - ))} + )} + {providers.map((provider, k) => { + const result = getResultForPromptVariationProvider( + messages, + variableVariation, + provider, + ) + return ( + + ) + })} + + )) + })}
VariablesResults
{variable}PromptVariables + + + + + {MODELS.find((model) => model.id === provider.model) + ?.name || provider.model} + + + + + + + + + + compareObjects(result.provider, provider), + )} + /> + + - - {provider && ( - - - - {MODELS.find( - (model) => model.id === provider.model, - )?.name || provider.model} - - - - - - - - - )} - {promptId && ( - - -
- -
-
- - - -
- )} - {passed + failed > 1 && ( - - - {`${passed}`} - - - {failed} - - - )} - - {duration && ( - - avg. {duration}s - - )} - {cost && ( - - avg. {formatCost(cost)} - - )} - -
-
{variableVariation[variable]} - {result ? ( - <> - {result.status === "success" ? ( - - + {prompts.map((messages, i) => { + const variableKeys = getVariableKeysForPrompt(messages) + const variableVariations = + getVariableVariationsForPrompt(messages) - - - - {result.passed ? "Passed" : "Failed"} - - - - - - - - - {(+result.duration / 1000).toFixed(2)}s -{" "} - {formatCost(result.cost)} - - - - ) : ( - {result.error || "Error"} + return variableVariations.map((variableVariation, k) => ( +
+ + + +
+ +
+
+ + + +
+ + compareObjects(result.messages, messages), )} - - ) : ( - N/A - )} + /> +
+
+ + + {variableKeys.map((variable, l) => ( + + ))} + +
+ + {`{{${variable}}}`} + {variableVariation[variable]} + +
+ +
diff --git a/packages/frontend/components/evals/index.module.css b/packages/frontend/components/evals/index.module.css index baf2fb41..4ce94261 100644 --- a/packages/frontend/components/evals/index.module.css +++ b/packages/frontend/components/evals/index.module.css @@ -6,6 +6,7 @@ border-collapse: collapse; border-spacing: 0; border: 1px solid var(--mantine-color-default-border); + vertical-align: middle; th { background: var(--mantine-color-body); @@ -16,8 +17,16 @@ border: 1px solid var(--mantine-color-default-border); padding: 16px; text-align: center; - min-width: 150px; - width: 200px; + vertical-align: middle; + } + + tr { + height: 1px; + } + + /* first col excluding nested cells */ + > tbody > tr > td:first-of-type { + min-width: 400px; } td.output-cell { @@ -26,4 +35,32 @@ text-align: left !important; } + + td.nested-cell { + padding: 0; + + height: 1px; + } + + td > table { + height: 100%; + width: 100%; + table-layout: fixed; + border-collapse: collapse; + border: none; + vertical-align: middle; + + td { + border-top: none; + border-bottom: none; + } + + tr td:first-child { + border-left: none; + } + + tr td:last-child { + border-right: none; + } + } } diff --git a/packages/frontend/components/layout/Empty.tsx b/packages/frontend/components/layout/Empty.tsx index 4ab2bb97..f21fee7f 100644 --- a/packages/frontend/components/layout/Empty.tsx +++ b/packages/frontend/components/layout/Empty.tsx @@ -98,22 +98,24 @@ export default function Empty({
)} - - Any issue? Get help from a founder. - - - - + {!process.env.NEXT_PUBLIC_IS_SELF_HOSTED && ( + + Any issue? Get help from a founder. + + + + + )} diff --git a/packages/frontend/pages/evaluations/[id].tsx b/packages/frontend/pages/evaluations/[id].tsx index 7520b2bd..4de9470c 100644 --- a/packages/frontend/pages/evaluations/[id].tsx +++ b/packages/frontend/pages/evaluations/[id].tsx @@ -14,7 +14,6 @@ import { Container, Group, Loader, - SegmentedControl, Stack, Text, Title, @@ -23,14 +22,13 @@ import { IconDatabase } from "@tabler/icons-react" import Link from "next/link" import { useRouter } from "next/router" -import { useState } from "react" // We create a matrix of results for each prompt, variable and model. // The matrix is a 3D array, where each dimension represents a different export default function EvalResults() { const router = useRouter() - const [groupBy, setGroupBy] = useState<"none" | "provider" | "prompt">("none") + const id = router.query.id as string const { data, isLoading: loading } = useProjectSWR( @@ -42,14 +40,6 @@ export default function EvalResults() { const { checklist } = useChecklist(evaluation?.checklistId) const { dataset } = useDataset(evaluation?.datasetId) - const uniqueProviders = Array.from( - new Set(data?.map((result) => JSON.stringify(result.provider))), - ) - - const uniquePrompts = Array.from( - new Set(data?.map((result) => result.promptId)), - ) - return ( @@ -88,57 +78,13 @@ export default function EvalResults() { - - Group results by: - - setGroupBy(value as "none" | "provider" | "prompt") - } - /> - - {loading ? ( ) : ( <> {data?.length > 0 ? ( - {groupBy === "none" && } - {groupBy === "provider" && - uniqueProviders.map((model) => ( - JSON.stringify(result.provider) === model, - )} - /> - ))} - {groupBy === "prompt" && - uniquePrompts.map((promptId) => ( - result.promptId === promptId, - )} - /> - ))} + ) : (

No data

diff --git a/packages/frontend/pages/join.tsx b/packages/frontend/pages/join.tsx index 54337c70..bbd5933d 100644 --- a/packages/frontend/pages/join.tsx +++ b/packages/frontend/pages/join.tsx @@ -43,15 +43,17 @@ function TeamFull({ orgName }) { - { - $crisp.push(["do", "chat:open"]) - }} - > - Contact support → - + {!process.env.NEXT_PUBLIC_IS_SELF_HOSTED && ( + { + $crisp.push(["do", "chat:open"]) + }} + > + Contact support → + + )}
@@ -65,7 +67,6 @@ export default function Join() { const [loading, setLoading] = useState(false) const [step, setStep] = useState(1) - const [ssoURI, setSsoURI] = useState(null) useEffect(() => { if (router.isReady) { @@ -153,8 +154,6 @@ export default function Join() { }) if (method === "saml") { - setSsoURI(redirect) - await handleSignup({ email, name, diff --git a/packages/frontend/pages/signup.tsx b/packages/frontend/pages/signup.tsx index 8bc73496..104691d7 100644 --- a/packages/frontend/pages/signup.tsx +++ b/packages/frontend/pages/signup.tsx @@ -460,15 +460,17 @@ function SignupPage() { - + {!process.env.NEXT_PUBLIC_IS_SELF_HOSTED && ( + + )}