Experimentation for RAG evaluation
antoninoLorenzo committed Jun 19, 2024
1 parent 7c547be commit 8ebf96b
Showing 7 changed files with 909 additions and 2 deletions.
6 changes: 6 additions & 0 deletions data/rag_eval/results/results.json
@@ -0,0 +1,6 @@
[
    {
        "context_precision": 0,
        "context_recall": 0
    }
]
Binary file not shown.
Binary file not shown.
8 changes: 7 additions & 1 deletion test/benchmarks/rag/dataset_generation.ipynb
@@ -474,7 +474,13 @@
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# TODO:\n",
    "# the generated dataset should be cleaned of:\n",
    "#  - items with ground_truth == 'the context does not provide sufficient information'\n",
    "#  - items with no groundedness: the question can't be answered from the given context\n",
    "#  - items whose question isn't standalone: it can't be understood without additional context"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
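The TODO in the notebook cell above names three filtering criteria for the synthetic dataset. A minimal sketch of that cleaning step could look like the code below; the critique field names ('groundedness_score', 'standalone_score') and the score threshold are assumptions for illustration, not part of this commit:

# Hypothetical cleaning step for the synthetic Q&A dataset; the critique field
# names and the threshold are assumed, only 'ground_truth' appears in the TODO.
INSUFFICIENT = 'the context does not provide sufficient information'

def clean_dataset(items: list[dict], min_score: int = 4) -> list[dict]:
    """Drops items that are unanswerable, ungrounded, or not standalone."""
    cleaned = []
    for item in items:
        if INSUFFICIENT in item.get('ground_truth', '').lower():
            continue  # the ground truth admits the context is insufficient
        if item.get('groundedness_score', 0) < min_score:
            continue  # the question can't be answered from the given context
        if item.get('standalone_score', 0) < min_score:
            continue  # the question can't be understood without additional context
        cleaned.append(item)
    return cleaned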
9 changes: 8 additions & 1 deletion test/benchmarks/rag/evaluation.py
@@ -50,7 +50,14 @@ def init_knowledge_base(data: dict[str, list[Topic]]) -> Store:
def evaluate(vdb: Store, synthetic_qa_path: str):
    """Given the Vector Database and the synthetic Q&A dataset
    generated in `dataset_generation.ipynb`, runs the evaluation
    process for the RAG pipeline.

    It consists of:
    - Retrieving the contexts and generating the answers for the questions.
    - Evaluating the full contexts-question-answer-ground_truths dataset.
    """
    pass


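The docstring of `evaluate` describes a two-step flow. A minimal, hypothetical sketch of that flow is given here; the retrieval and generation callables and the metric objects are assumptions rather than the project's actual API:

# Hypothetical sketch of the two-step evaluation described in the evaluate() docstring.
import json
from typing import Callable

def run_evaluation(retrieve: Callable[[str], list[str]],
                   generate: Callable[[str, list[str]], str],
                   metrics: dict,
                   synthetic_qa_path: str) -> dict:
    with open(synthetic_qa_path, encoding='utf-8') as fp:
        qa_items = json.load(fp)

    # Step 1: retrieve the contexts and generate an answer for each question.
    records = []
    for item in qa_items:
        contexts = retrieve(item['question'])
        answer = generate(item['question'], contexts)
        records.append({
            'question': item['question'],
            'contexts': contexts,
            'answer': answer,
            'ground_truth': item['ground_truth'],
        })

    # Step 2: evaluate the full contexts-question-answer-ground_truths dataset.
    return {name: metric.compute(records) for name, metric in metrics.items()}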
57 changes: 57 additions & 0 deletions test/benchmarks/rag/metrics.py
@@ -0,0 +1,57 @@
"""The metrics used to perform the RAG evaluation"""
import re
import json
import textwrap
from json import JSONDecodeError
from dataclasses import dataclass
from abc import ABC, abstractmethod

import requests
import numpy as np


EVAL_PROMPTS = {
    'mistral': {
        'context_recall': {
            'sys': '',
            'usr': ''
        },
        'context_precision': {
            'sys': '',
            'usr': ''
        }
    }
}


@dataclass
class HuggingFaceLLM:
    """Represents a HuggingFace Inference Endpoint; used for convenience when running the evaluation."""
    url: str
    key: str

    def __post_init__(self):
        self.headers = {"Authorization": f"Bearer {self.key}", "Content-Type": "application/json"}

    def __query(self, payload):
        # POST the prompt to the Inference Endpoint and return the parsed JSON response.
        response = requests.post(self.url, headers=self.headers, json={'inputs': payload})
        response.raise_for_status()
        return response.json()

    def query(self, messages: list):
        # Flatten the chat messages into a single prompt string before querying the endpoint.
        prompt = '\n'.join([msg['content'] for msg in messages])
        return self.__query(prompt)


@dataclass
class Metric(ABC):
    """Represents a RAG evaluation metric based on the LLM-as-a-judge paradigm."""
    system_prompt: str
    user_prompt: str
    llm_provider: HuggingFaceLLM

    @abstractmethod
    def compute(self, *args, **kwargs) -> float:
        """Computes the metric value; must be implemented by each concrete metric."""
        pass
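`Metric` leaves `compute` abstract, so each concrete metric supplies its own prompts and scoring. A hypothetical subclass is sketched below, assuming it lives in metrics.py (where re, json and JSONDecodeError are already imported) and that the judge LLM replies with JSON such as {"score": 0.8}; none of this is defined by the commit itself:

# Hypothetical concrete metric; the prompt placeholders, the expected output
# format, and the scoring scheme are assumptions.
@dataclass
class ContextRecall(Metric):
    """Judges how much of the ground truth is supported by the retrieved context."""

    def compute(self, question: str, context: str, ground_truth: str) -> float:
        usr = self.user_prompt.format(question=question, context=context,
                                      ground_truth=ground_truth)
        messages = [
            {'role': 'system', 'content': self.system_prompt},
            {'role': 'user', 'content': usr},
        ]
        output = self.llm_provider.query(messages)
        # HuggingFace text-generation endpoints typically return [{'generated_text': ...}].
        text = output[0]['generated_text'] if isinstance(output, list) else str(output)
        match = re.search(r'\{.*\}', text, re.DOTALL)
        try:
            return float(json.loads(match.group())['score']) if match else 0.0
        except (JSONDecodeError, KeyError, ValueError):
            return 0.0

Once the prompts are filled in, it could be wired up as, for example, ContextRecall(EVAL_PROMPTS['mistral']['context_recall']['sys'], EVAL_PROMPTS['mistral']['context_recall']['usr'], HuggingFaceLLM(url='...', key='...')).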

831 changes: 831 additions & 0 deletions test/benchmarks/rag/tmp.ipynb

Large diffs are not rendered by default.
