From ff59e96b9ca6983cc32617197a123f30df95fb73 Mon Sep 17 00:00:00 2001
From: Stijn Goossens <22433228+StijnGoossens@users.noreply.github.com>
Date: Sun, 5 Nov 2023 19:04:04 +0000
Subject: [PATCH] fix: explicitly cast OutputFormat

---
 src/llm_app_eval/app.py             | 10 ++++-
 src/llm_app_eval/eval_properties.py |  3 +-
 src/llm_app_eval/evaluator.py       |  4 +-
 src/llm_app_eval/example.ipynb      | 64 +++++++++++++++++------------
 4 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/src/llm_app_eval/app.py b/src/llm_app_eval/app.py
index f143fd4..a57b203 100644
--- a/src/llm_app_eval/app.py
+++ b/src/llm_app_eval/app.py
@@ -12,10 +12,16 @@
 st.title(f"llm-app-eval v{version('llm-app-eval')}")  # type: ignore[no-untyped-call]
 
-
+# Define the paths to the test cases and evaluation results
 TEST_SET_FOLDER = "src/llm_app_eval/data/test_cases"
 EVAL_FOLDER = "src/llm_app_eval/data/eval_results"
-EVAL_RUNS = ["20231001_175828"]
+
+# Create folders if they don't exist
+os.makedirs(TEST_SET_FOLDER, exist_ok=True)
+os.makedirs(EVAL_FOLDER, exist_ok=True)
+
+# Get the list of evaluation runs
+EVAL_RUNS = os.listdir(EVAL_FOLDER)
 
 # Load all the test cases JSON files
 test_cases = {}  # type: ignore
diff --git a/src/llm_app_eval/eval_properties.py b/src/llm_app_eval/eval_properties.py
index 64c6038..63e2ed6 100644
--- a/src/llm_app_eval/eval_properties.py
+++ b/src/llm_app_eval/eval_properties.py
@@ -5,7 +5,8 @@
 import openai
 from pydantic import BaseModel
 
-from llm_app_eval.evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase
+from llm_app_eval.evaluator import EvalProperty, PropertyResult, TestCase
+from llm_app_eval.llm_app import OutputFormat
 
 PROPERTY_LLM = "gpt-3.5-turbo-0613"
diff --git a/src/llm_app_eval/evaluator.py b/src/llm_app_eval/evaluator.py
index 6141cd3..dfde2a3 100644
--- a/src/llm_app_eval/evaluator.py
+++ b/src/llm_app_eval/evaluator.py
@@ -36,8 +36,6 @@ class TestCaseResult(BaseModel):
     output: OutputFormat
     property_results: dict[str, PropertyResult]
     latency: float
-    cosine_similarity: Optional[float] = None
-    verbosity: Optional[float] = None
 
 
 class Evaluator:
@@ -149,7 +147,7 @@ def evaluate_app(
             # Store results as JSON
             tcr = TestCaseResult(
                 test_case_id=test_case.test_id,
-                output=app_output,
+                output=OutputFormat(answer=app_output.answer),
                 property_results=property_results,
                 latency=latency,
             )
diff --git a/src/llm_app_eval/example.ipynb b/src/llm_app_eval/example.ipynb
index e8a4d9d..7bec956 100644
--- a/src/llm_app_eval/example.ipynb
+++ b/src/llm_app_eval/example.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,14 +61,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define properties.\n",
     "def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:\n",
     "    return evaluate_property_with_llm(\n",
-    "        model=\"gpt-3.5-turbo-0613\",\n",
+    "        model=\"gpt-4\",\n",
     "        system_message=\"Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.\",\n",
     "        user_message=f\"Answer: {llm_app_result.answer}\\nReference Answer: {test_case.reference_output.answer}\",\n",
     "    )\n",
@@ -103,7 +103,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -125,9 +125,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluating test cases: 100%|██████████| 3/3 [00:23<00:00, 7.75s/test case]\n",
+      "Evaluating test cases: 100%|██████████| 3/3 [00:20<00:00, 6.79s/test case]\n",
+      "Evaluating test cases: 100%|██████████| 3/3 [00:29<00:00, 9.91s/test case]\n"
+     ]
+    }
+   ],
    "source": [
     "# Evaluate the LLM apps on the test set by using the properties.\n",
     "ev = Evaluator(test_set=test_cases, properties=properties, results_dir=\"data/eval_results\")\n",
@@ -137,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
       " 0\n",
       " gpt-3.5-turbo-0613\n",
       " Answer the question.\n",
-      " 0.893214\n",
-      " 1.000000\n",
-      " 8.539927\n",
-      " 46.845016\n",
+      " 0.904431\n",
+      " 0.666667\n",
+      " 8.370790\n",
+      " 2.814149\n",
       " \n",
       " \n",
       " 0\n",
       " gpt-3.5-turbo-0613\n",
       " You are a first-aid expert. Answer the questio...\n",
-      " 0.899895\n",
+      " 0.903006\n",
       " 0.666667\n",
-      " 2.762177\n",
-      " 1.940822\n",
+      " 2.031130\n",
+      " 1.684336\n",
       " \n",
       " \n",
       " 0\n",
       " gpt-4\n",
       " You are a first-aid expert. Answer the questio...\n",
-      " 0.908613\n",
+      " 0.907844\n",
       " 1.000000\n",
-      " 5.177815\n",
-      " 6.374512\n",
+      " 5.116838\n",
+      " 5.495382\n",
       " \n",
       " \n",
       "\n",
       " model system_prompt \\\n",
       "0 gpt-3.5-turbo-0613 Answer the question. \n",
       "0 gpt-3.5-turbo-0613 You are a first-aid expert. Answer the questio... \n",
       "0 gpt-4 You are a first-aid expert. Answer the questio... \n",
       "\n",
       " CosineSimilarity.score FactuallyConsistent.score Verbosity.score \\\n",
-      "0 0.893214 1.000000 8.539927 \n",
-      "0 0.899895 0.666667 2.762177 \n",
-      "0 0.908613 1.000000 5.177815 \n",
+      "0 0.904431 0.666667 8.370790 \n",
+      "0 0.903006 0.666667 2.031130 \n",
+      "0 0.907844 1.000000 5.116838 \n",
       "\n",
-      " latency \n",
-      "0 46.845016 \n",
-      "0 1.940822 \n",
-      "0 6.374512 "
+      " latency \n",
+      "0 2.814149 \n",
+      "0 1.684336 \n",
+      "0 5.495382 "
      ]
     },
-    "execution_count": 10,
+    "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    },