From ff59e96b9ca6983cc32617197a123f30df95fb73 Mon Sep 17 00:00:00 2001
From: Stijn Goossens <22433228+StijnGoossens@users.noreply.github.com>
Date: Sun, 5 Nov 2023 19:04:04 +0000
Subject: [PATCH] fix: explicitly cast OutputFormat

---
 src/llm_app_eval/app.py             | 10 ++++-
 src/llm_app_eval/eval_properties.py |  3 +-
 src/llm_app_eval/evaluator.py       |  4 +-
 src/llm_app_eval/example.ipynb      | 64 +++++++++++++++++------------
 4 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/src/llm_app_eval/app.py b/src/llm_app_eval/app.py
index f143fd4..a57b203 100644
--- a/src/llm_app_eval/app.py
+++ b/src/llm_app_eval/app.py
@@ -12,10 +12,16 @@
 st.title(f"llm-app-eval v{version('llm-app-eval')}")  # type: ignore[no-untyped-call]
 
-
+# Define the paths to the test cases and evaluation results
 TEST_SET_FOLDER = "src/llm_app_eval/data/test_cases"
 EVAL_FOLDER = "src/llm_app_eval/data/eval_results"
-EVAL_RUNS = ["20231001_175828"]
+
+# Create folders if they don't exist
+os.makedirs(TEST_SET_FOLDER, exist_ok=True)
+os.makedirs(EVAL_FOLDER, exist_ok=True)
+
+# Get the list of evaluation runs
+EVAL_RUNS = os.listdir(EVAL_FOLDER)
 
 # Load all the test cases JSON files
 test_cases = {}  # type: ignore
diff --git a/src/llm_app_eval/eval_properties.py b/src/llm_app_eval/eval_properties.py
index 64c6038..63e2ed6 100644
--- a/src/llm_app_eval/eval_properties.py
+++ b/src/llm_app_eval/eval_properties.py
@@ -5,7 +5,8 @@
 import openai
 from pydantic import BaseModel
 
-from llm_app_eval.evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase
+from llm_app_eval.evaluator import EvalProperty, PropertyResult, TestCase
+from llm_app_eval.llm_app import OutputFormat
 
 PROPERTY_LLM = "gpt-3.5-turbo-0613"
diff --git a/src/llm_app_eval/evaluator.py b/src/llm_app_eval/evaluator.py
index 6141cd3..dfde2a3 100644
--- a/src/llm_app_eval/evaluator.py
+++ b/src/llm_app_eval/evaluator.py
@@ -36,8 +36,6 @@ class TestCaseResult(BaseModel):
     output: OutputFormat
     property_results: dict[str, PropertyResult]
     latency: float
-    cosine_similarity: Optional[float] = None
-    verbosity: Optional[float] = None
 
 
 class Evaluator:
@@ -149,7 +147,7 @@ def evaluate_app(
             # Store results as JSON
             tcr = TestCaseResult(
                 test_case_id=test_case.test_id,
-                output=app_output,
+                output=OutputFormat(answer=app_output.answer),
                 property_results=property_results,
                 latency=latency,
             )
diff --git a/src/llm_app_eval/example.ipynb b/src/llm_app_eval/example.ipynb
index e8a4d9d..7bec956 100644
--- a/src/llm_app_eval/example.ipynb
+++ b/src/llm_app_eval/example.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,14 +61,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define properties.\n",
     "def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:\n",
     "    return evaluate_property_with_llm(\n",
-    "        model=\"gpt-3.5-turbo-0613\",\n",
+    "        model=\"gpt-4\",\n",
     "        system_message=\"Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.\",\n",
     "        user_message=f\"Answer: {llm_app_result.answer}\\nReference Answer: {test_case.reference_output.answer}\",\n",
     "    )\n",
@@ -103,7 +103,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -125,9 +125,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Evaluating test cases: 100%|██████████| 3/3 [00:23<00:00, 7.75s/test case]\n",
+      "Evaluating test cases: 100%|██████████| 3/3 [00:20<00:00, 6.79s/test case]\n",
+      "Evaluating test cases: 100%|██████████| 3/3 [00:29<00:00, 9.91s/test case]\n"
+     ]
+    }
+   ],
    "source": [
     "# Evaluate the LLM apps on the test set by using the properties.\n",
     "ev = Evaluator(test_set=test_cases, properties=properties, results_dir=\"data/eval_results\")\n",
@@ -137,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
       " 0\n",
       " gpt-3.5-turbo-0613\n",
       " Answer the question.\n",
-      " 0.893214\n",
-      " 1.000000\n",
-      " 8.539927\n",
-      " 46.845016\n",
+      " 0.904431\n",
+      " 0.666667\n",
+      " 8.370790\n",
+      " 2.814149\n",
       " \n",
       " \n",
       " 0\n",
       " gpt-3.5-turbo-0613\n",
       " You are a first-aid expert. Answer the questio...\n",
-      " 0.899895\n",
+      " 0.903006\n",
       " 0.666667\n",
-      " 2.762177\n",
-      " 1.940822\n",
+      " 2.031130\n",
+      " 1.684336\n",
       " \n",
       " \n",
       " 0\n",
       " gpt-4\n",
       " You are a first-aid expert. Answer the questio...\n",
-      " 0.908613\n",
+      " 0.907844\n",
       " 1.000000\n",
-      " 5.177815\n",
-      " 6.374512\n",
+      " 5.116838\n",
+      " 5.495382\n",
       " \n",
       " \n",
       "\n",
       " model system_prompt \\\n",
       "0 gpt-3.5-turbo-0613 Answer the question. \n",
       "0 gpt-3.5-turbo-0613 You are a first-aid expert. Answer the questio... \n",
       "0 gpt-4 You are a first-aid expert. Answer the questio... \n",
       "\n",
       " CosineSimilarity.score FactuallyConsistent.score Verbosity.score \\\n",
-      "0 0.893214 1.000000 8.539927 \n",
-      "0 0.899895 0.666667 2.762177 \n",
-      "0 0.908613 1.000000 5.177815 \n",
+      "0 0.904431 0.666667 8.370790 \n",
+      "0 0.903006 0.666667 2.031130 \n",
+      "0 0.907844 1.000000 5.116838 \n",
       "\n",
-      " latency \n",
-      "0 46.845016 \n",
-      "0 1.940822 \n",
-      "0 6.374512 "
+      " latency \n",
+      "0 2.814149 \n",
+      "0 1.684336 \n",
+      "0 5.495382 "
      ]
     },
-    "execution_count": 10,
+    "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    },