Experimentation for RAG evaluation
antoninoLorenzo committed Jun 19, 2024
1 parent 7c547be commit 8ebf96b
Showing 7 changed files with 909 additions and 2 deletions.
6 changes: 6 additions & 0 deletions data/rag_eval/results/results.json
@@ -0,0 +1,6 @@
[
    {
        "context_precision": 0,
        "context_recall": 0
    }
]
Binary file not shown.
Binary file not shown.
8 changes: 7 additions & 1 deletion test/benchmarks/rag/dataset_generation.ipynb
@@ -474,7 +474,13 @@
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# TODO:\n",
    "# the generated dataset should be cleaned of:\n",
    "#  - items with ground_truth == 'the context does not provide sufficient information'\n",
    "#  - items with no groundedness: the question can't be answered from the given context\n",
    "#  - items whose question isn't standalone: it can't be understood without additional context"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
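The TODO in the notebook cell above names three filtering criteria for the synthetic dataset. A minimal sketch of that cleaning step could look like the code below; the critique field names ('groundedness_score', 'standalone_score') and the score threshold are assumptions for illustration, not part of this commit:

# Hypothetical cleaning step for the synthetic Q&A dataset; the critique field
# names and the threshold are assumed, only 'ground_truth' appears in the TODO.
INSUFFICIENT = 'the context does not provide sufficient information'

def clean_dataset(items: list[dict], min_score: int = 4) -> list[dict]:
    """Drops items that are unanswerable, ungrounded, or not standalone."""
    cleaned = []
    for item in items:
        if INSUFFICIENT in item.get('ground_truth', '').lower():
            continue  # the ground truth admits the context is insufficient
        if item.get('groundedness_score', 0) < min_score:
            continue  # the question can't be answered from the given context
        if item.get('standalone_score', 0) < min_score:
            continue  # the question can't be understood without additional context
        cleaned.append(item)
    return cleaned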
9 changes: 8 additions & 1 deletion test/benchmarks/rag/evaluation.py
@@ -50,7 +50,14 @@ def init_knowledge_base(data: dict[str, list[Topic]]) -> Store:
def evaluate(vdb: Store, synthetic_qa_path: str):
    """Given the Vector Database and the synthetic Q&A dataset
    generated in `dataset_generation.ipynb`, runs the evaluation
    process for the RAG pipeline.

    It consists of:
    - Retrieving the contexts and generating the answers for the questions.
    - Evaluating the full contexts-question-answer-ground_truths dataset.
    """
    pass


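The docstring of `evaluate` describes a two-step flow. A minimal, hypothetical sketch of that flow is given here; the retrieval and generation callables and the metric objects are assumptions rather than the project's actual API:

# Hypothetical sketch of the two-step evaluation described in the evaluate() docstring.
import json
from typing import Callable

def run_evaluation(retrieve: Callable[[str], list[str]],
                   generate: Callable[[str, list[str]], str],
                   metrics: dict,
                   synthetic_qa_path: str) -> dict:
    with open(synthetic_qa_path, encoding='utf-8') as fp:
        qa_items = json.load(fp)

    # Step 1: retrieve the contexts and generate an answer for each question.
    records = []
    for item in qa_items:
        contexts = retrieve(item['question'])
        answer = generate(item['question'], contexts)
        records.append({
            'question': item['question'],
            'contexts': contexts,
            'answer': answer,
            'ground_truth': item['ground_truth'],
        })

    # Step 2: evaluate the full contexts-question-answer-ground_truths dataset.
    return {name: metric.compute(records) for name, metric in metrics.items()}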
57 changes: 57 additions & 0 deletions test/benchmarks/rag/metrics.py
@@ -0,0 +1,57 @@
"""The metrics used to perform the RAG evaluation"""
import re
import json
import textwrap
from json import JSONDecodeError
from dataclasses import dataclass
from abc import ABC, abstractmethod

import requests
import numpy as np


EVAL_PROMPTS = {
    'mistral': {
        'context_recall': {
            'sys': '',
            'usr': ''
        },
        'context_precision': {
            'sys': '',
            'usr': ''
        }
    }
}


@dataclass
class HuggingFaceLLM:
    """Represents a HuggingFace Inference Endpoint; used for convenience when running the evaluation."""
    url: str
    key: str

    def __post_init__(self):
        self.headers = {"Authorization": f"Bearer {self.key}", "Content-Type": "application/json"}

    def __query(self, payload):
        # POST the prompt to the Inference Endpoint and return the parsed JSON response.
        response = requests.post(self.url, headers=self.headers, json={'inputs': payload})
        response.raise_for_status()
        return response.json()

    def query(self, messages: list):
        # Flatten the chat messages into a single prompt string before querying the endpoint.
        prompt = '\n'.join([msg['content'] for msg in messages])
        return self.__query(prompt)


@dataclass
class Metric(ABC):
    """Represents a RAG evaluation metric based on the LLM-as-a-judge paradigm."""
    system_prompt: str
    user_prompt: str
    llm_provider: HuggingFaceLLM

    @abstractmethod
    def compute(self, *args, **kwargs) -> float:
        """Computes the metric value; must be implemented by each concrete metric."""
        pass
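`Metric` leaves `compute` abstract, so each concrete metric supplies its own prompts and scoring. A hypothetical subclass is sketched below, assuming it lives in metrics.py (where re, json and JSONDecodeError are already imported) and that the judge LLM replies with JSON such as {"score": 0.8}; none of this is defined by the commit itself:

# Hypothetical concrete metric; the prompt placeholders, the expected output
# format, and the scoring scheme are assumptions.
@dataclass
class ContextRecall(Metric):
    """Judges how much of the ground truth is supported by the retrieved context."""

    def compute(self, question: str, context: str, ground_truth: str) -> float:
        usr = self.user_prompt.format(question=question, context=context,
                                      ground_truth=ground_truth)
        messages = [
            {'role': 'system', 'content': self.system_prompt},
            {'role': 'user', 'content': usr},
        ]
        output = self.llm_provider.query(messages)
        # HuggingFace text-generation endpoints typically return [{'generated_text': ...}].
        text = output[0]['generated_text'] if isinstance(output, list) else str(output)
        match = re.search(r'\{.*\}', text, re.DOTALL)
        try:
            return float(json.loads(match.group())['score']) if match else 0.0
        except (JSONDecodeError, KeyError, ValueError):
            return 0.0

Once the prompts are filled in, it could be wired up as, for example, ContextRecall(EVAL_PROMPTS['mistral']['context_recall']['sys'], EVAL_PROMPTS['mistral']['context_recall']['usr'], HuggingFaceLLM(url='...', key='...')).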

831 changes: 831 additions & 0 deletions test/benchmarks/rag/tmp.ipynb

Large diffs are not rendered by default.
