docs: add diagram to readme
StijnGoossens committed Nov 4, 2023
1 parent 6e46e3d commit e801bfd
Showing 5 changed files with 34 additions and 28 deletions.
28 changes: 11 additions & 17 deletions README.md
@@ -1,22 +1,16 @@
[![Open in Dev Containers](https://img.shields.io/static/v1?label=Dev%20Containers&message=Open&color=blue&logo=visualstudiocode)](https://vscode.dev/redirect?url=vscode://ms-vscode-remote.remote-containers/cloneInVolume?url=https://github.com/StijnGoossens/llm-app-eval)

# llm-app-eval

Evaluates LLM-based applications.

**Check the `example.ipynb` notebook for an example of how to use this package.**

## To-do's
- [x] Convert EHBO notes into question-answer pairs, using OpenAI Function Calling.
- [x] Turn the question-answer pairs into a test set.
- [x] Build an LLM component to evaluate the given answers by comparing them with the reference answers.
- [x] Build LLM 'app' that can answer the questions.
- [x] Evaluate the LLM app with the LLM evaluator.
- [x] Streamlit page to view the evaluation results.
- [x] Combine the evaluation results into a single metric.
- [x] Evaluate and compare different LLM apps (GPT-3.5, GPT-4, with RAG)
- [x] Integrate with MLflow for experiment tracking.
- [ ] Streamlit page to view, edit and add test cases.
# Evaluation of LLM-based applications

An implementation of the principles of evaluating LLM-based applications. This repository accompanies the blog post ['Steady the Course: Navigating the Evaluation of LLM-based Applications'](https://medium.com/@stijn.sg.goossens/steady-the-course-navigating-the-evaluation-of-llm-based-applications-8b7a22734fc9).

💡 Check out the [example notebook](src/llm_app_eval/example.ipynb) for an end-to-end illustration of the most important concepts (LLM app, test case, test properties and Evaluator), including the integration with MLflow.

🔑 Add your OpenAI API key to a file named `openai_key` in the root directory before running the notebook.
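
One plausible way to load that key in a notebook cell is sketched below; it assumes the code expects the module-level `openai.api_key` to be set (the exact loading mechanism is not shown here):

```python
# Sketch: load the key from the `openai_key` file in the repository root.
# Assumption: the notebook/evaluator reads the module-level `openai.api_key`.
from pathlib import Path

import openai

openai.api_key = Path("openai_key").read_text().strip()
```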

The image below shows an architectural overview of the evaluation framework and illustrates an important feedback loop. See the aforementioned blog post for more information. The scope of this repository is indicated by the green box.

![Evaluation feedback loop](images/evaluation_feedback_loop.png)

## Using

Binary file added images/evaluation_feedback_loop.png
3 changes: 2 additions & 1 deletion src/llm_app_eval/app.py
@@ -7,7 +7,8 @@
import numpy as np
import pandas as pd
import streamlit as st
from eval_properties import properties

from llm_app_eval.eval_properties import properties

st.title(f"llm-app-eval v{version('llm-app-eval')}") # type: ignore[no-untyped-call]

28 changes: 19 additions & 9 deletions src/llm_app_eval/eval_properties.py
@@ -1,10 +1,12 @@
from functools import lru_cache
from typing import Optional

import numpy as np
import openai
from evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase
from pydantic import BaseModel

from llm_app_eval.evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase

PROPERTY_LLM = "gpt-3.5-turbo-0613"


@@ -18,7 +20,9 @@ def cosine_similarity(a, b):
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def output_similarity(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
def output_similarity(
test_case: TestCase, llm_app_result: OutputFormat
) -> Optional[PropertyResult]:
if test_case.reference_output and llm_app_result.answer:
app_output_emb = get_embedding(llm_app_result.answer)
reference_emb = get_embedding(test_case.reference_output.answer)
@@ -31,7 +35,7 @@ def output_similarity(test_case: TestCase, llm_app_result: OutputFormat) -> Prop
return result


def output_verbosity(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
def output_verbosity(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
if test_case.reference_output and llm_app_result.answer:
result = PropertyResult(
feedback="", score=len(llm_app_result.answer) / len(test_case.reference_output.answer)
@@ -59,7 +63,9 @@ def evaluate_property_with_llm(
)


def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
def factually_consistent(
test_case: TestCase, llm_app_result: OutputFormat
) -> Optional[PropertyResult]:
if test_case.reference_output and llm_app_result.answer:
result = evaluate_property_with_llm(
model=PROPERTY_LLM,
@@ -71,7 +77,9 @@ def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> P
return result


def improves_historical_answer(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
def improves_historical_answer(
test_case: TestCase, llm_app_result: OutputFormat
) -> Optional[PropertyResult]:
if test_case.test_input and test_case.historical_output and llm_app_result.answer:
result = evaluate_property_with_llm(
model=PROPERTY_LLM,
@@ -85,7 +93,7 @@ def improves_historical_answer(test_case: TestCase, llm_app_result: OutputFormat

def takes_feedback_into_account(
test_case: TestCase, llm_app_result: OutputFormat
) -> PropertyResult:
) -> Optional[PropertyResult]:
if (
test_case.test_input
and test_case.historical_output
@@ -102,12 +110,14 @@ def takes_feedback_into_account(
return result


def length_within_bounds(test_case: TestCase, llm_app_result: OutputFormat) -> PropertyResult:
def length_within_bounds(
test_case: TestCase, llm_app_result: OutputFormat
) -> Optional[PropertyResult]:
if test_case.reference_output and llm_app_result.answer:
if len(llm_app_result.answer) <= 1.2 * len(test_case.reference_output.answer):
result = PropertyResult(feedback="The answer is not too long.", pass_fail=True)
result = PropertyResult(feedback="The answer is not too long.", score=1)
else:
result = PropertyResult(feedback="The answer is too long.", pass_fail=False)
result = PropertyResult(feedback="The answer is too long.", score=0)
else:
result = None
return result
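All property functions in this module share the same pattern: take a `TestCase` and the app's `OutputFormat`, return a `PropertyResult` with `feedback` and `score` when the required fields are present, and return `None` otherwise. A minimal sketch of a custom property in that style (the `starts_with_capital` property is hypothetical, not part of this repository):

```python
from typing import Optional

from llm_app_eval.evaluator import OutputFormat, PropertyResult, TestCase


def starts_with_capital(
    test_case: TestCase, llm_app_result: OutputFormat
) -> Optional[PropertyResult]:
    # Hypothetical property: check that the answer starts with a capital letter.
    # `test_case` is unused here but kept to match the expected signature.
    if llm_app_result.answer:
        passed = llm_app_result.answer[0].isupper()
        return PropertyResult(
            feedback=(
                "The answer starts with a capital letter."
                if passed
                else "The answer does not start with a capital letter."
            ),
            score=1 if passed else 0,
        )
    # Like the properties above, return None when the test case does not apply.
    return None
```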
3 changes: 2 additions & 1 deletion src/llm_app_eval/evaluator.py
@@ -6,10 +6,11 @@

import mlflow
import pandas as pd
from llm_app import BaseApp, InputFormat, OutputFormat
from pydantic import BaseModel
from tqdm import tqdm

from llm_app_eval.llm_app import BaseApp, InputFormat, OutputFormat


class TestCase(BaseModel):
test_id: str
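`TestCase` is a pydantic model; only `test_id` is visible in this hunk, and the remaining fields are truncated. Judging from how the properties access `test_case.reference_output.answer`, constructing a case might look roughly like this (the `reference_output` field name and `answer` being the only required `OutputFormat` field are assumptions):

```python
from llm_app_eval.evaluator import OutputFormat, TestCase

# Illustrative only: the field names and types beyond `test_id` are inferred
# from their usage in eval_properties.py, not from the truncated definitions.
case = TestCase(
    test_id="ehbo-001",
    reference_output=OutputFormat(answer="Call 112 and start CPR."),
)
```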
