chore: remove unnecessary files
StijnGoossens committed Nov 8, 2023
1 parent ff59e96 commit 1d4a68b
Showing 6 changed files with 31 additions and 584 deletions.
6 changes: 2 additions & 4 deletions README.md
@@ -4,20 +4,18 @@

An implementation of the principles of evaluating LLM-based applications. This repository accompanies the blog post ['Steady the Course: Navigating the Evaluation of LLM-based Applications'](https://medium.com/@stijn.sg.goossens/steady-the-course-navigating-the-evaluation-of-llm-based-applications-8b7a22734fc9).

-💡 Check out the [example notebook](src/llm_app_eval/example.ipynb) for an end-to-end illustration of the most important concepts (LLM app, test case, test properties and Evaluator), including the integration with MLflow.
+💡 Check out the [example notebook](src/llm_app_eval/example.ipynb) for an end-to-end illustration of the most important concepts (LLM app, test case, test properties and Evaluator), including the integration with [MLflow](https://github.com/mlflow/mlflow/tree/master).

🔑 Add your OpenAI API key to a file named `openai_key` in the root directory before running the notebook.
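The key-setup step above can be sketched as follows. This is a minimal sketch only: how the notebook actually reads `openai_key` is an assumption, and the placeholder fallback is purely illustrative so the snippet runs even when the file is absent.

```python
from pathlib import Path

# Sketch (assumption): load the OpenAI API key from a file named `openai_key`
# in the repository root, as the README instructs. The placeholder fallback
# is illustrative only and not part of the repository.
key_file = Path("openai_key")
if key_file.exists():
    openai_api_key = key_file.read_text().strip()
else:
    openai_api_key = "sk-placeholder"
print(type(openai_api_key).__name__)  # → str
```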

-The image below shows an architectural overview of the evaluation framework and illustrates an important feedback loop. See the aforementioned blog post for more information. The scope of this repository is indicated by the green box.
+The image below shows the evaluation framework and illustrates an important feedback loop to improve your LLM app further. See the aforementioned blog post for more information. The scope of this repository is indicated by the green box.

![Evaluation feedback loop](images/evaluation_feedback_loop.png)

## Using

_Python package_: to add and install this package as a dependency of your project, run `poetry add llm-app-eval`.

-_Python application_: to serve this Streamlit app, run `docker compose up app` and open [localhost:8000](http://localhost:8000) in your browser. Within the Dev Container, this is equivalent to running `poe app`.

## Contributing

<details>
101 changes: 0 additions & 101 deletions src/llm_app_eval/app.py

This file was deleted.

58 changes: 29 additions & 29 deletions src/llm_app_eval/example.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
-    "execution_count": 8,
+    "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -12,7 +12,7 @@
},
{
"cell_type": "code",
-    "execution_count": 9,
+    "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
-    "execution_count": 10,
+    "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -61,7 +61,7 @@
},
{
"cell_type": "code",
-    "execution_count": 11,
+    "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -103,7 +103,7 @@
},
{
"cell_type": "code",
-    "execution_count": 12,
+    "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -125,16 +125,16 @@
},
{
"cell_type": "code",
-    "execution_count": 14,
+    "execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
-"Evaluating test cases: 100%|██████████| 3/3 [00:23<00:00, 7.75s/test case]\n",
-"Evaluating test cases: 100%|██████████| 3/3 [00:20<00:00, 6.79s/test case]\n",
-"Evaluating test cases: 100%|██████████| 3/3 [00:29<00:00, 9.91s/test case]\n"
+"Evaluating test cases: 100%|██████████| 3/3 [00:30<00:00, 10.06s/test case]\n",
+"Evaluating test cases: 100%|██████████| 3/3 [00:19<00:00, 6.37s/test case]\n",
+"Evaluating test cases: 100%|██████████| 3/3 [00:43<00:00, 14.59s/test case]\n"
]
}
],
@@ -147,7 +147,7 @@
},
{
"cell_type": "code",
-    "execution_count": 15,
+    "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -184,28 +184,28 @@
" <th>0</th>\n",
" <td>gpt-3.5-turbo-0613</td>\n",
" <td>Answer the question.</td>\n",
-" <td>0.904431</td>\n",
-" <td>0.666667</td>\n",
-" <td>8.370790</td>\n",
-" <td>2.814149</td>\n",
+" <td>0.891321</td>\n",
+" <td>1.0</td>\n",
+" <td>8.454041</td>\n",
+" <td>2.887883</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>gpt-3.5-turbo-0613</td>\n",
" <td>You are a first-aid expert. Answer the questio...</td>\n",
-" <td>0.903006</td>\n",
-" <td>0.666667</td>\n",
-" <td>2.031130</td>\n",
-" <td>1.684336</td>\n",
+" <td>0.895463</td>\n",
+" <td>1.0</td>\n",
+" <td>5.316218</td>\n",
+" <td>2.421593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>gpt-4</td>\n",
" <td>You are a first-aid expert. Answer the questio...</td>\n",
-" <td>0.907844</td>\n",
-" <td>1.000000</td>\n",
-" <td>5.116838</td>\n",
-" <td>5.495382</td>\n",
+" <td>0.899324</td>\n",
+" <td>1.0</td>\n",
+" <td>6.696650</td>\n",
+" <td>9.665427</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -218,17 +218,17 @@
"0 gpt-4 You are a first-aid expert. Answer the questio... \n",
"\n",
" CosineSimilarity.score FactuallyConsistent.score Verbosity.score \\\n",
-"0 0.904431 0.666667 8.370790 \n",
-"0 0.903006 0.666667 2.031130 \n",
-"0 0.907844 1.000000 5.116838 \n",
+"0 0.891321 1.0 8.454041 \n",
+"0 0.895463 1.0 5.316218 \n",
+"0 0.899324 1.0 6.696650 \n",
"\n",
" latency \n",
-"0 2.814149 \n",
-"0 1.684336 \n",
-"0 5.495382 "
+"0 2.887883 \n",
+"0 2.421593 \n",
+"0 9.665427 "
]
},
-   "execution_count": 15,
+   "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
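The notebook diff above updates a results table comparing three LLM configurations on cosine similarity, factual consistency, verbosity, and latency. As a sketch of how such results could be compared (the DataFrame construction below is not part of the repository; column names and values are copied from the new side of the diff):

```python
import pandas as pd

# Sketch: the evaluation results from the updated notebook output,
# re-entered as a DataFrame (values taken from the new side of the diff).
results = pd.DataFrame({
    "llm": ["gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613", "gpt-4"],
    "system_prompt": [
        "Answer the question.",
        "You are a first-aid expert. Answer the questio...",
        "You are a first-aid expert. Answer the questio...",
    ],
    "CosineSimilarity.score": [0.891321, 0.895463, 0.899324],
    "FactuallyConsistent.score": [1.0, 1.0, 1.0],
    "Verbosity.score": [8.454041, 5.316218, 6.696650],
    "latency": [2.887883, 2.421593, 9.665427],
})

# Rank configurations by cosine similarity (higher is better),
# breaking ties on latency (lower is better).
ranked = results.sort_values(
    ["CosineSimilarity.score", "latency"], ascending=[False, True]
)
print(ranked.iloc[0]["llm"])  # → gpt-4
```

Note the trade-off visible in the numbers: `gpt-4` edges out on similarity but at roughly four times the latency of the first-aid-prompted `gpt-3.5-turbo-0613`.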
