From 73792c66bafb5f7dec26d8280b746d3c8c0f5f00 Mon Sep 17 00:00:00 2001
From: Stijn Goossens <22433228+StijnGoossens@users.noreply.github.com>
Date: Tue, 10 Oct 2023 09:34:59 +0000
Subject: [PATCH] feat: view test results in Streamlit

---
 README.md                                |  19 ++-
 src/llm_app_eval/app.py                  | 123 ++++++++++++------
 .../20231001_evaluate_properties.ipynb   |  10 +-
 3 files changed, 97 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 7e4a2c6..c4854db 100644
--- a/README.md
+++ b/README.md
@@ -5,20 +5,17 @@ Evaluates LLM-based applications.
 
 ## To-do's
 
-- [x] Convert my EHBO notes into question-answer pairs, using OpenAI Function Calling.
-- [/] Turn the question-answer pairs into a test set.
-- [x] Build Streamlit app for testing myself.
-  - [] Bug: when I click on the 'Evaluate' button, the app goes to the next question.
+- [x] Convert EHBO notes into question-answer pairs, using OpenAI Function Calling.
+- [x] Turn the question-answer pairs into a test set.
 - [x] Build LLM component to evaluate the given answers by comparing it with the reference answer.
 - [x] Build LLM 'app' that can answer the questions.
 - [x] Evaluate the LLM app with the LLM evaluator.
-- [] Streamlit page to view the evaluation results.
-- [] Add the question-answer pairs as the knowledge base for that app.
-- [] Evaluate the LLM app with the LLM evaluator.
-- [] Compare the results.
-- [] Streamlit page to view, edit and add test cases.
-- [] Cache the OpenAI function calls.
-
+- [x] Streamlit page to view the evaluation results.
+- [ ] Combine the evaluation results into a single metric.
+- [ ] Evaluate and compare different LLM apps (GPT-3.5, GPT-4, with RAG).
+- [ ] Streamlit page to visualize the comparison.
+- [ ] Streamlit page to view, edit and add test cases.
+- [ ] Integrate with MLflow for experiment tracking (?)
 
 ## Using
 
diff --git a/src/llm_app_eval/app.py b/src/llm_app_eval/app.py
index 4223ea5..79b8094 100644
--- a/src/llm_app_eval/app.py
+++ b/src/llm_app_eval/app.py
@@ -1,49 +1,94 @@
 """Streamlit app."""
+import json
+import os
 from importlib.metadata import version
 
 import numpy as np
+import pandas as pd
 import streamlit as st
-from evaluator import Evaluator
-from qa_extraction import load_qa_pairs
+from eval_properties import properties
 
 st.title(f"llm-app-eval v{version('llm-app-eval')}")  # type: ignore[no-untyped-call]
 
-qa_pairs = load_qa_pairs("src/llm_app_eval/data/question_answer_pairs.csv")
-evaluator = Evaluator(llm="gpt-4")
-
-# Shuffle the question and answer pairs
-np.random.seed(42)
-np.random.shuffle(qa_pairs)
-# Display a question and answer pair
-if "idx" in st.session_state:
-    idx = st.session_state.idx
-else:
-    idx = 0
-    st.session_state.idx = idx
-st.write(f"Question {idx + 1} of {len(qa_pairs)}")
-qa = qa_pairs[idx]
-st.header("Question")
-st.write(qa.question)
-st.header("Answer")
-answer = st.text_input("Answer")
-st.header("Reference Answer")
-st.write(qa.answer)
-
-
-eval_button = st.button("Evaluate")
-if eval_button:
-    result = evaluator.evaluate(qa.question, answer, qa.answer)
-    st.write("✅" if result.pass_fail else "❌")
-    st.write(result.feedback)
-    st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
-else:
-    # Display previous and next buttons
-    col1, col2, col3 = st.columns(3)
-    if col1.button("Previous"):
-        st.session_state.idx = max(st.session_state.idx - 1, 0)
-    if col2.button("Random"):
-        st.session_state.idx = np.random.randint(0, len(qa_pairs))
-    if col3.button("Next"):
-        st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
+TEST_SET_FOLDER = "src/llm_app_eval/data/test_cases"
+EVAL_FOLDER = "src/llm_app_eval/data/eval_results"
+EVAL_RUNS = ["20231001_175828"]
+
+# Load all the test case JSON files
+test_cases = {}  # type: ignore
+for test_case in os.listdir(TEST_SET_FOLDER):
+    test_case_path = os.path.join(TEST_SET_FOLDER, test_case)
+    with open(test_case_path) as f:
+        test_cases[test_case] = json.load(f)
+
+# Load all the evaluation result JSON files
+eval_results = {}  # type: ignore
+for eval_run in EVAL_RUNS:
+    eval_results[eval_run] = {}
+    eval_run_folder = os.path.join(EVAL_FOLDER, eval_run)
+    for eval_file in os.listdir(eval_run_folder):
+        eval_file_path = os.path.join(eval_run_folder, eval_file)
+        with open(eval_file_path) as f:
+            eval_results[eval_run][eval_file] = json.load(f)
+
+# Build a matrix for each evaluation run.
+# Each row is a test case. Each column is a property.
+eval_matrices = {}  # type: ignore
+for eval_run in EVAL_RUNS:
+    eval_matrices[eval_run] = np.zeros((len(test_cases), len(properties)))
+    for test_case_idx, test_case in enumerate(test_cases):
+        for property_idx, prop in enumerate(properties):
+            r = eval_results[eval_run][test_case]
+            for property_result in r["property_results"]:
+                if property_result["property_name"] == prop.property_name:
+                    eval_matrices[eval_run][test_case_idx, property_idx] = property_result[
+                        "pass_fail"
+                    ]
+                    break
+    # Turn the matrix into a dataframe
+    eval_matrices[eval_run] = pd.DataFrame(
+        eval_matrices[eval_run],
+        columns=[prop.property_name for prop in properties],
+        index=list(test_cases),
+    )
+
+st.write(eval_matrices[eval_run])
+
+# Select a specific test case
+test_case = st.selectbox("Test case", list(test_cases.keys()))  # type: ignore
+
+# Select a specific evaluation run
+eval_run = st.selectbox("Evaluation run", EVAL_RUNS)  # type: ignore
+
+# Show the test case input
+st.markdown("**Test case input:**")
+st.write(test_cases[test_case]["test_input"]["question"])
+# Show the reference_output, historical_output, and historical_feedback, if available
+if test_cases[test_case]["reference_output"]:
+    st.markdown("**Reference output:**")
+    st.write(test_cases[test_case]["reference_output"]["answer"])
+if test_cases[test_case]["historical_output"]:
+    st.markdown("**Historical output:**")
+    st.write(test_cases[test_case]["historical_output"]["answer"])
+if test_cases[test_case]["historical_feedback"]:
+    st.markdown("**Historical feedback:**")
+    st.write(test_cases[test_case]["historical_feedback"])
+
+# Show the model output
+st.markdown("**Model response:**")
+st.write(eval_results[eval_run][test_case]["output"]["answer"])
+
+# Show the evaluation results
+st.markdown("**Evaluation results:**")
+# Loop over the properties
+for prop in properties:
+    # Loop over the evaluation runs
+    for eval_run in EVAL_RUNS:
+        # Loop over the evaluation results
+        for property_result in eval_results[eval_run][test_case]["property_results"]:
+            # If the property name matches the current property, show the result
+            if property_result["property_name"] == prop.property_name:
+                st.write(f"{prop.property_name}: {'✅' if property_result['pass_fail'] else '❌'}")
+                st.write(property_result["feedback"])
diff --git a/src/llm_app_eval/notebooks/20231001_evaluate_properties.ipynb b/src/llm_app_eval/notebooks/20231001_evaluate_properties.ipynb
index bf2a90f..f155815 100644
--- a/src/llm_app_eval/notebooks/20231001_evaluate_properties.ipynb
+++ b/src/llm_app_eval/notebooks/20231001_evaluate_properties.ipynb
@@ -41,28 +41,28 @@
    "source": [
     "test_cases = [\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=1,\n",
     "        test_input={\"question\": \"Waarom zou het slachtoffer naar de dokter moeten gaan na het Heimlich-manoeuvre?\"},\n",
     "        reference_output={\"answer\": \"Omdat het Heimlich-manoeuvre een interne bloeding kan hebben veroorzaakt.\"},\n",
     "    ),\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=2,\n",
     "        test_input={\"question\": \"Wat zijn de vier stappen van eerste hulp?\"},\n",
     "        reference_output={\"answer\": \"1. Zorg voor veiligheid, 2. Beoordeel de toestand van het slachtoffer, 3. Hulpdiensten verwittigen indien nodig, 4. Verleen verdere eerste hulp.\"},\n",
     "    ),\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=3,\n",
     "        test_input={\"question\": \"Wat is de eerste stap van eerste hulp?\"},\n",
     "        historical_output={\"answer\": \"Zorg voor de veiligheid van het slachtoffer.\"},\n",
     "        historical_feedback=\"Het is belangrijk om ook voor de veiligheid van jezelf en omstaanders te zorgen.\",\n",
     "    ),\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=4,\n",
     "        test_input={\"question\": \"Wat moet je doen als het slachtoffer geen ademhaling heeft?\"},\n",
     "        historical_output={\"answer\": \"Bel 112\"},\n",
     "    ),\n",
     "    TestCase(\n",
-    "        test_id=uuid.uuid4().hex,\n",
+    "        test_id=5,\n",
     "        test_input={\"question\": \"Moet je eten of drinken toedienen in een noodsituatie?\"},\n",
     "        reference_output={\"answer\": \"Nee, behalve bij een hypo (lage bloedsuiker) of hitte- en zonneslag\"},\n",
     "    ),\n",
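
The eval matrices built in app.py above (one row per test case, one column per property, values 1.0/0.0 for pass/fail) map directly onto the open "Combine the evaluation results into a single metric" to-do. Below is a minimal sketch of one way that aggregation could look, assuming the pandas DataFrame produced in app.py; the `aggregate_eval_matrix` helper and the property names used in the example are illustrative only and not part of this patch.

```python
# Sketch only: reduce a pass/fail eval matrix to summary metrics.
# Assumes a DataFrame shaped like the one built in app.py: one row per
# test case, one column per property, values 1.0 (pass) or 0.0 (fail).
import pandas as pd


def aggregate_eval_matrix(eval_matrix: pd.DataFrame) -> dict:
    """Reduce a (test cases x properties) pass/fail matrix to summary scores."""
    per_property = eval_matrix.mean(axis=0)   # pass rate per property
    per_test_case = eval_matrix.mean(axis=1)  # fraction of properties passed per test case
    overall = eval_matrix.to_numpy().mean()   # single headline metric
    return {
        "per_property": per_property.to_dict(),
        "per_test_case": per_test_case.to_dict(),
        "overall_pass_rate": float(overall),
    }


if __name__ == "__main__":
    # Tiny example with made-up property names and values.
    matrix = pd.DataFrame(
        {"factually_consistent": [1.0, 0.0], "no_unsafe_advice": [1.0, 1.0]},
        index=["test_case_1.json", "test_case_2.json"],
    )
    print(aggregate_eval_matrix(matrix))
```

Because each evaluation run would collapse to one row of per-property pass rates plus an overall score, the same aggregation would also feed the planned comparison page for different LLM apps.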