
Commit

fix: explicitly cast OutputFormat
StijnGoossens committed Nov 5, 2023
1 parent e801bfd commit ff59e96
Showing 4 changed files with 48 additions and 33 deletions.
10 changes: 8 additions & 2 deletions src/llm_app_eval/app.py
@@ -12,10 +12,16 @@

st.title(f"llm-app-eval v{version('llm-app-eval')}") # type: ignore[no-untyped-call]


# Define the paths to the test cases and evaluation results
TEST_SET_FOLDER = "src/llm_app_eval/data/test_cases"
EVAL_FOLDER = "src/llm_app_eval/data/eval_results"
EVAL_RUNS = ["20231001_175828"]

# Create folders if they don't exist
os.makedirs(TEST_SET_FOLDER, exist_ok=True)
os.makedirs(EVAL_FOLDER, exist_ok=True)

# Get the list of evaluation runs
EVAL_RUNS = os.listdir(EVAL_FOLDER)

# Load all the test cases JSON files
test_cases = {} # type: ignore
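The app previously hard-coded a single evaluation run ID; it now makes sure the data folders exist and discovers runs from disk at startup. A minimal sketch of that pattern, assuming the folder paths from the diff (the newest-first sort is an illustrative addition, not part of the commit):

```python
import os

# Paths to the test cases and evaluation results (as in the diff).
TEST_SET_FOLDER = "src/llm_app_eval/data/test_cases"
EVAL_FOLDER = "src/llm_app_eval/data/eval_results"

# Create the folders so a fresh checkout doesn't fail on the listing below.
os.makedirs(TEST_SET_FOLDER, exist_ok=True)
os.makedirs(EVAL_FOLDER, exist_ok=True)

# Discover evaluation runs dynamically instead of hard-coding one run ID.
# Run folders are timestamped (e.g. "20231001_175828"), so a reverse
# lexicographic sort would put the most recent run first (illustrative).
EVAL_RUNS = sorted(os.listdir(EVAL_FOLDER), reverse=True)
```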
3 changes: 2 additions & 1 deletion src/llm_app_eval/eval_properties.py
@@ -5,7 +5,8 @@
import openai
from pydantic import BaseModel

from llm_app_eval.evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase
from llm_app_eval.evaluator import EvalProperty, PropertyResult, TestCase
from llm_app_eval.llm_app import OutputFormat

PROPERTY_LLM = "gpt-3.5-turbo-0613"

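OutputFormat now comes from llm_app_eval.llm_app, the module that defines the app's I/O model, instead of being re-exported through the evaluator. A custom property written against the new import layout could look like the notebook's factually_consistent example; the import location of evaluate_property_with_llm is an assumption, as it is not shown in the diff:

```python
from llm_app_eval.eval_properties import evaluate_property_with_llm  # assumed location
from llm_app_eval.evaluator import PropertyResult, TestCase
from llm_app_eval.llm_app import OutputFormat


def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
    # Let an LLM judge whether the answer is factually consistent with the reference.
    return evaluate_property_with_llm(
        model="gpt-4",
        system_message=(
            "Evaluate the answer. The answer should be factually consistent "
            "with the reference answer. If not, explain why."
        ),
        user_message=f"Answer: {llm_app_result.answer}\n"
        f"Reference Answer: {test_case.reference_output.answer}",
    )
```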
4 changes: 1 addition & 3 deletions src/llm_app_eval/evaluator.py
@@ -36,8 +36,6 @@ class TestCaseResult(BaseModel):
output: OutputFormat
property_results: dict[str, PropertyResult]
latency: float
cosine_similarity: Optional[float] = None
verbosity: Optional[float] = None


class Evaluator:
@@ -149,7 +147,7 @@ def evaluate_app(
# Store results as JSON
tcr = TestCaseResult(
test_case_id=test_case.test_id,
output=app_output,
output=OutputFormat(answer=app_output.answer),
property_results=property_results,
latency=latency,
)
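This is the change the commit title refers to: TestCaseResult drops the unused cosine_similarity and verbosity fields, and the app output is cast to OutputFormat before being stored, presumably so the persisted result always matches the declared schema even when the LLM app returns a richer model. A hedged sketch of why the cast matters; ExtendedOutput is a hypothetical example, not part of the repository:

```python
from pydantic import BaseModel


class OutputFormat(BaseModel):
    answer: str


# Hypothetical app output carrying extra metadata beyond the evaluation schema.
class ExtendedOutput(OutputFormat):
    retrieved_documents: list[str] = []


app_output = ExtendedOutput(
    answer="Apply firm pressure to the wound.",
    retrieved_documents=["first_aid_manual.md"],
)

# Without the cast, the subclass instance (and its extra field) would be stored.
# With the explicit cast, only the declared `answer` field survives.
result_output = OutputFormat(answer=app_output.answer)
print(result_output)  # answer='Apply firm pressure to the wound.'
```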
64 changes: 37 additions & 27 deletions src/llm_app_eval/example.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -61,14 +61,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Define properties.\n",
"def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:\n",
" return evaluate_property_with_llm(\n",
" model=\"gpt-3.5-turbo-0613\",\n",
" model=\"gpt-4\",\n",
" system_message=\"Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.\",\n",
" user_message=f\"Answer: {llm_app_result.answer}\\nReference Answer: {test_case.reference_output.answer}\",\n",
" )\n",
@@ -103,7 +103,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -125,9 +125,19 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating test cases: 100%|██████████| 3/3 [00:23<00:00, 7.75s/test case]\n",
"Evaluating test cases: 100%|██████████| 3/3 [00:20<00:00, 6.79s/test case]\n",
"Evaluating test cases: 100%|██████████| 3/3 [00:29<00:00, 9.91s/test case]\n"
]
}
],
"source": [
"# Evaluate the LLM apps on the test set by using the properties.\n",
"ev = Evaluator(test_set=test_cases, properties=properties, results_dir=\"data/eval_results\")\n",
@@ -137,7 +147,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -174,28 +184,28 @@
" <th>0</th>\n",
" <td>gpt-3.5-turbo-0613</td>\n",
" <td>Answer the question.</td>\n",
" <td>0.893214</td>\n",
" <td>1.000000</td>\n",
" <td>8.539927</td>\n",
" <td>46.845016</td>\n",
" <td>0.904431</td>\n",
" <td>0.666667</td>\n",
" <td>8.370790</td>\n",
" <td>2.814149</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>gpt-3.5-turbo-0613</td>\n",
" <td>You are a first-aid expert. Answer the questio...</td>\n",
" <td>0.899895</td>\n",
" <td>0.903006</td>\n",
" <td>0.666667</td>\n",
" <td>2.762177</td>\n",
" <td>1.940822</td>\n",
" <td>2.031130</td>\n",
" <td>1.684336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>gpt-4</td>\n",
" <td>You are a first-aid expert. Answer the questio...</td>\n",
" <td>0.908613</td>\n",
" <td>0.907844</td>\n",
" <td>1.000000</td>\n",
" <td>5.177815</td>\n",
" <td>6.374512</td>\n",
" <td>5.116838</td>\n",
" <td>5.495382</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -208,17 +218,17 @@
"0 gpt-4 You are a first-aid expert. Answer the questio... \n",
"\n",
" CosineSimilarity.score FactuallyConsistent.score Verbosity.score \\\n",
"0 0.893214 1.000000 8.539927 \n",
"0 0.899895 0.666667 2.762177 \n",
"0 0.908613 1.000000 5.177815 \n",
"0 0.904431 0.666667 8.370790 \n",
"0 0.903006 0.666667 2.031130 \n",
"0 0.907844 1.000000 5.116838 \n",
"\n",
" latency \n",
"0 46.845016 \n",
"0 1.940822 \n",
"0 6.374512 "
" latency \n",
"0 2.814149 \n",
"0 1.684336 \n",
"0 5.495382 "
]
},
"execution_count": 10,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
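The notebook changes switch the factual-consistency judge from gpt-3.5-turbo-0613 to gpt-4, capture the progress bars of the three evaluation runs in the cell output, and refresh the score and latency table. A hedged sketch of how the persisted results could be inspected outside the notebook; the per-run folder layout and one-JSON-file-per-test-case assumption are inferred from the app's EVAL_FOLDER handling, not confirmed by the diff:

```python
import json
import os

import pandas as pd

EVAL_FOLDER = "src/llm_app_eval/data/eval_results"

rows = []
for run in sorted(os.listdir(EVAL_FOLDER)):  # run folders, e.g. "20231001_175828"
    run_dir = os.path.join(EVAL_FOLDER, run)
    for file_name in os.listdir(run_dir):
        if not file_name.endswith(".json"):
            continue
        with open(os.path.join(run_dir, file_name)) as f:
            result = json.load(f)  # assumed: a serialized TestCaseResult
        rows.append(
            {
                "run": run,
                "test_case_id": result["test_case_id"],
                "latency": result["latency"],
            }
        )

df = pd.DataFrame(rows)
print(df.groupby("run")["latency"].mean())
```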
