feat: view test results in Streamlit

superlinear-ai · Oct 10, 2023
commit 73792c6
1 parent: a75e167
Show file tree

Hide file tree

Showing 3 changed files with 97 additions and 55 deletions.
diff --git a/README.md b/README.md
@@ -5,20 +5,17 @@
 Evaluates LLM-based applications.
 
 ## To-do's
-- [x] Convert my EHBO notes into question-answer pairs, using OpenAI Function Calling.
-- [/] Turn the question-answer pairs into a test set.
-- [x] Build Streamlit app for testing myself.
-    - [] Bug: when I click on the 'Evaluate' button, the app goes to the next question.
+- [x] Convert EHBO notes into question-answer pairs, using OpenAI Function Calling.
+- [x] Turn the question-answer pairs into a test set.
 - [x] Build LLM component to evaluate the given answers by comparing it with the reference answer.
 - [x] Build LLM 'app' that can answer the questions.
 - [x] Evaluate the LLM app with the LLM evaluator.
-- [] Streamlit page to view the evaluation results.
-- [] Add the question-answer pairs as the knowledge base for that app.
-- [] Evaluate the LLM app with the LLM evaluator.
-- [] Compare the results.
-- [] Streamlit page to view, edit and add test cases.
-- [] Cache the OpenAI function calls.
-
+- [x] Streamlit page to view the evaluation results.
+- [ ] Combine the evaluation results into a single metric.
+- [ ] Evaluate and compare different LLM apps (GPT-3.5, GPT-4, with RAG)
+- [ ] Streamlit page to visualize the comparison.
+- [ ] Streamlit page to view, edit and add test cases.
+- [ ] Integrate with MLflow for experiment tracking (?)
 
 
 ## Using

diff --git a/src/llm_app_eval/app.py b/src/llm_app_eval/app.py
@@ -1,49 +1,94 @@
 """Streamlit app."""
 
+import json
+import os
 from importlib.metadata import version
 
 import numpy as np
+import pandas as pd
 import streamlit as st
-from evaluator import Evaluator
-from qa_extraction import load_qa_pairs
+from eval_properties import properties
 
 st.title(f"llm-app-eval v{version('llm-app-eval')}")  # type: ignore[no-untyped-call]
 
 
-qa_pairs = load_qa_pairs("src/llm_app_eval/data/question_answer_pairs.csv")
-evaluator = Evaluator(llm="gpt-4")
-
-# Shuffle the question and answer pairs
-np.random.seed(42)
-np.random.shuffle(qa_pairs)
-# Display a question and answer pair
-if "idx" in st.session_state:
-    idx = st.session_state.idx
-else:
-    idx = 0
-    st.session_state.idx = idx
-st.write(f"Question {idx + 1} of {len(qa_pairs)}")
-qa = qa_pairs[idx]
-st.header("Question")
-st.write(qa.question)
-st.header("Answer")
-answer = st.text_input("Answer")
-st.header("Reference Answer")
-st.write(qa.answer)
-
-
-eval_button = st.button("Evaluate")
-if eval_button:
-    result = evaluator.evaluate(qa.question, answer, qa.answer)
-    st.write("✅" if result.pass_fail else "❌")
-    st.write(result.feedback)
-    st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
-else:
-    # Display previous and next buttons
-    col1, col2, col3 = st.columns(3)
-    if col1.button("Previous"):
-        st.session_state.idx = max(st.session_state.idx - 1, 0)
-    if col2.button("Random"):
-        st.session_state.idx = np.random.randint(0, len(qa_pairs))
-    if col3.button("Next"):
-        st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
+TEST_SET_FOLDER = "src/llm_app_eval/data/test_cases"
+EVAL_FOLDER = "src/llm_app_eval/data/eval_results"
+EVAL_RUNS = ["20231001_175828"]
+
+# Load all the test cases JSON files
+test_cases = {}  # type: ignore
+for test_case in os.listdir(TEST_SET_FOLDER):
+    test_case_path = os.path.join(TEST_SET_FOLDER, test_case)
+    with open(test_case_path) as f:
+        test_cases[test_case] = json.load(f)
+
+# Load all the evaluation results JSON files
+eval_results = {}  # type: ignore
+for eval_run in EVAL_RUNS:
+    eval_results[eval_run] = {}
+    eval_run_folder = os.path.join(EVAL_FOLDER, eval_run)
+    for eval_file in os.listdir(eval_run_folder):
+        eval_file_path = os.path.join(eval_run_folder, eval_file)
+        with open(eval_file_path) as f:
+            eval_results[eval_run][eval_file] = json.load(f)
+
+# Build a matrix for each evaluation run
+# Each row is a test case. Each column is a property.
+eval_matrices = {}  # type: ignore
+for eval_run in EVAL_RUNS:
+    eval_matrices[eval_run] = np.zeros((len(test_cases), len(properties)))
+    for test_case_idx, test_case in enumerate(test_cases):
+        for property_idx, prop in enumerate(properties):
+            r = eval_results[eval_run][test_case]
+            for property_result in r["property_results"]:
+                if property_result["property_name"] == prop.property_name:
+                    eval_matrices[eval_run][test_case_idx, property_idx] = property_result[
+                        "pass_fail"
+                    ]
+                    break
+    # Turn the matrix into a dataframe
+    eval_matrices[eval_run] = pd.DataFrame(
+        eval_matrices[eval_run],
+        columns=[prop.property_name for prop in properties],
+        index=list(test_cases),
+    )
+
+st.write(eval_matrices[eval_run])
+
+# Select a specific test case
+test_case = st.selectbox("Test case", list(test_cases.keys()))  # type: ignore
+
+# Select a specific evaluation run
+eval_run = st.selectbox("Evaluation run", EVAL_RUNS)  # type: ignore
+
+# Show the test case input
+st.markdown("**Test case input:**")
+st.write(test_cases[test_case]["test_input"]["question"])
+# Show the reference_output, historical_output, and historical_feedback, if available
+if test_cases[test_case]["reference_output"]:
+    st.markdown("**Reference output:**")
+    st.write(test_cases[test_case]["reference_output"]["answer"])
+if test_cases[test_case]["historical_output"]:
+    st.markdown("**Historical output:**")
+    st.write(test_cases[test_case]["historical_output"]["answer"])
+if test_cases[test_case]["historical_feedback"]:
+    st.markdown("**Historical feedback:**")
+    st.write(test_cases[test_case]["historical_feedback"])
+
+# Show the model output
+st.markdown("**Model response:**")
+st.write(eval_results[eval_run][test_case]["output"]["answer"])
+
+# Show the evaluation results
+st.markdown("**Evaluation results:**")
+# Loop over the properties
+for prop in properties:
+    # Loop over the evaluation runs
+    for eval_run in EVAL_RUNS:
+        # Loop over the evaluation results
+        for property_result in eval_results[eval_run][test_case]["property_results"]:
+            # If the property name matches the current property, show the result
+            if property_result["property_name"] == prop.property_name:
+                st.write(f"{prop.property_name}: {'✅' if property_result['pass_fail'] else '❌'}")
+                st.write(property_result["feedback"])
diff --git a/src/llm_app_eval/notebooks/20231001_evaluate_properties.ipynb b/src/llm_app_eval/notebooks/20231001_evaluate_properties.ipynb
@@ -41,28 +41,28 @@
    "source": [
     "test_cases = [\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=1,\n",
     "        test_input={\"question\": \"Waarom zou het slachtoffer naar de dokter moeten gaan na het Heimlich-manoeuvre?\"},\n",
     "        reference_output={\"answer\": \"Omdat het Heimlich-manoeuvre een interne bloeding kan hebben veroorzaakt.\"},\n",
     "    ),\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=2,\n",
     "        test_input={\"question\": \"Wat zijn de vier stappen van eerste hulp?\"},\n",
     "        reference_output={\"answer\": \"1. Zorg voor veiligheid, 2. Beoordeel de toestand van het slachtoffer, 3. Hulpdiensten verwittigen indien nodig, 4. Verleen verdere eerste hulp.\"},\n",
     "    ),\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=3,\n",
     "        test_input={\"question\": \"Wat is de eerste stap van eerste hulp?\"},\n",
     "        historical_output={\"answer\": \"Zorg voor de veiligheid van het slachtoffer.\"},\n",
     "        historical_feedback=\"Het is belangrijk om ook voor de veiligheid van jezelf en omstaanders te zorgen.\",\n",
     "    ),\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=4,\n",
     "        test_input={\"question\": \"Wat moet je doen als het slachtoffer geen ademhaling heeft?\"},\n",
     "        historical_output={\"answer\": \"Bel 112\"},\n",
     "    ),\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=5,\n",
     "        test_input={\"question\": \"Moet je eten of drinken toedienen in een noodsituatie?\"},\n",
     "        reference_output={\"answer\": \"Nee, behalve bij een hypo (lage bloedsuiker) of hitte- en zonneslag\"},\n",
     "    ),\n",