From ff59e96b9ca6983cc32617197a123f30df95fb73 Mon Sep 17 00:00:00 2001
From: Stijn Goossens <22433228+StijnGoossens@users.noreply.github.com>
Date: Sun, 5 Nov 2023 19:04:04 +0000
Subject: [PATCH] fix: explicitly cast OutputFormat

---
 src/llm_app_eval/app.py             | 10 ++++-
 src/llm_app_eval/eval_properties.py |  3 +-
 src/llm_app_eval/evaluator.py       |  4 +-
 src/llm_app_eval/example.ipynb      | 64 +++++++++++++++++------------
 4 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/src/llm_app_eval/app.py b/src/llm_app_eval/app.py
index f143fd4..a57b203 100644
--- a/src/llm_app_eval/app.py
+++ b/src/llm_app_eval/app.py
@@ -12,10 +12,16 @@
 st.title(f"llm-app-eval v{version('llm-app-eval')}")  # type: ignore[no-untyped-call]
 
 
-
+# Define the paths to the test cases and evaluation results
 TEST_SET_FOLDER = "src/llm_app_eval/data/test_cases"
 EVAL_FOLDER = "src/llm_app_eval/data/eval_results"
-EVAL_RUNS = ["20231001_175828"]
+
+# Create folders if they don't exist
+os.makedirs(TEST_SET_FOLDER, exist_ok=True)
+os.makedirs(EVAL_FOLDER, exist_ok=True)
+
+# Get the list of evaluation runs
+EVAL_RUNS = os.listdir(EVAL_FOLDER)
 
 # Load all the test cases JSON files
 test_cases = {}  # type: ignore
diff --git a/src/llm_app_eval/eval_properties.py b/src/llm_app_eval/eval_properties.py
index 64c6038..63e2ed6 100644
--- a/src/llm_app_eval/eval_properties.py
+++ b/src/llm_app_eval/eval_properties.py
@@ -5,7 +5,8 @@
 import openai
 from pydantic import BaseModel
 
-from llm_app_eval.evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase
+from llm_app_eval.evaluator import EvalProperty, PropertyResult, TestCase
+from llm_app_eval.llm_app import OutputFormat
 
 PROPERTY_LLM = "gpt-3.5-turbo-0613"
 
diff --git a/src/llm_app_eval/evaluator.py b/src/llm_app_eval/evaluator.py
index 6141cd3..dfde2a3 100644
--- a/src/llm_app_eval/evaluator.py
+++ b/src/llm_app_eval/evaluator.py
@@ -36,8 +36,6 @@ class TestCaseResult(BaseModel):
     output: OutputFormat
     property_results: dict[str, PropertyResult]
     latency: float
-    cosine_similarity: Optional[float] = None
-    verbosity: Optional[float] = None
 
 
 class Evaluator:
@@ -149,7 +147,7 @@ def evaluate_app(
         # Store results as JSON
         tcr = TestCaseResult(
             test_case_id=test_case.test_id,
-            output=app_output,
+            output=OutputFormat(answer=app_output.answer),
             property_results=property_results,
             latency=latency,
         )
diff --git a/src/llm_app_eval/example.ipynb b/src/llm_app_eval/example.ipynb
index e8a4d9d..7bec956 100644
--- a/src/llm_app_eval/example.ipynb
+++ b/src/llm_app_eval/example.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,14 +61,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define properties.\n",
     "def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:\n",
     "    return evaluate_property_with_llm(\n",
-    "        model=\"gpt-3.5-turbo-0613\",\n",
+    "        model=\"gpt-4\",\n",
     "        system_message=\"Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.\",\n",
     "        user_message=f\"Answer: {llm_app_result.answer}\\nReference Answer: {test_case.reference_output.answer}\",\n",
     "    )\n",
@@ -103,7 +103,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -125,9 +125,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluating test cases: 100%|██████████| 3/3 [00:23<00:00, 7.75s/test case]\n",
+      "Evaluating test cases: 100%|██████████| 3/3 [00:20<00:00, 6.79s/test case]\n",
+      "Evaluating test cases: 100%|██████████| 3/3 [00:29<00:00, 9.91s/test case]\n"
+     ]
+    }
+   ],
    "source": [
     "# Evaluate the LLM apps on the test set by using the properties.\n",
     "ev = Evaluator(test_set=test_cases, properties=properties, results_dir=\"data/eval_results\")\n",
@@ -137,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -174,28 +184,28 @@
 "
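
The heart of this patch is the evaluator.py change from `output=app_output` to `output=OutputFormat(answer=app_output.answer)`. The sketch below is not code from the repository; it is a minimal illustration, assuming `OutputFormat` is a pydantic model with an `answer: str` field (as the call `OutputFormat(answer=...)` suggests), of why an explicit cast like this can matter when an app returns its own output type. `CustomAppOutput` and the trimmed-down `TestCaseResult` are hypothetical stand-ins.

# Minimal sketch, assuming OutputFormat is a pydantic model with a single
# `answer: str` field (inferred from `OutputFormat(answer=app_output.answer)`).
# CustomAppOutput and this simplified TestCaseResult are hypothetical.
from pydantic import BaseModel, ValidationError


class OutputFormat(BaseModel):
    answer: str


class TestCaseResult(BaseModel):
    test_case_id: str
    output: OutputFormat
    latency: float


class CustomAppOutput(BaseModel):
    # An app-specific result type that is *not* an OutputFormat subclass.
    answer: str
    retrieved_context: str


app_output = CustomAppOutput(answer="Paris", retrieved_context="France article")

# Passing the app's own output object directly fails validation, because the
# `output` field expects an OutputFormat instance (or a dict), not an
# unrelated model.
try:
    TestCaseResult(test_case_id="tc1", output=app_output, latency=0.1)
except ValidationError as exc:
    print(f"direct assignment rejected: {exc.errors()[0]['type']}")

# Explicitly casting keeps only the `answer` field, so every stored result
# serializes with the same schema regardless of the app's output type.
result = TestCaseResult(
    test_case_id="tc1",
    output=OutputFormat(answer=app_output.answer),
    latency=0.1,
)
print(result.output)  # answer='Paris'

Under these assumptions, the cast also drops any extra fields an app-specific output might carry, which keeps the JSON written per test case uniform across the evaluated apps.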