From 679a2d25e94253b78a04c3a0e15f5289c1762c9f Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 19 Dec 2024 16:21:57 +0000
Subject: [PATCH 1/2] adds llm as a judge evaluation of synthetic testset

---
 data/.gitignore                       |  1 +
 dvc.yaml                              | 11 ++++-
 params.yaml                           |  1 +
 prompts/synth-eval.txt                | 58 +++++++++++++++++++++++++++
 scripts/evaluate_synthetic_testset.py | 35 ++++++++++++++++
 scripts/run_rag_pipeline.py           |  3 +-
 6 files changed, 106 insertions(+), 3 deletions(-)
 create mode 100644 prompts/synth-eval.txt
 create mode 100644 scripts/evaluate_synthetic_testset.py

diff --git a/data/.gitignore b/data/.gitignore
index b8dc417..aa990e4 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -17,3 +17,4 @@
 /eidc_rag_test_set.csv
 /rag-pipeline.yml
 /pipeline.yml
+/cleaned_testset.csv
diff --git a/dvc.yaml b/dvc.yaml
index 179c315..1880ef2 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -62,17 +62,24 @@ stages:
     - scripts/generate_synthetic_testset.py
     outs:
     - ${files.test-set}
+  evaluate-synthetic-testset:
+    cmd: uv run scripts/evaluate_synthetic_testset.py ${files.test-set} ${files.cleaned-test-set}
+    deps:
+    - ${files.test-set}
+    - scripts/evaluate_synthetic_testset.py
+    outs:
+    - ${files.cleaned-test-set}
   run-rag-pipeline:
     cmd: >-
       uv run scripts/run_rag_pipeline.py
-      -i ${files.test-set}
+      -i ${files.cleaned-test-set}
       -o ${files.eval-set}
       -ds ${files.doc-store}
       -c ${doc-store.collection}
       -m ${rag.model}
       -p ${files.pipeline}
     deps:
-    - ${files.test-set}
+    - ${files.cleaned-test-set}
     - ${files.doc-store}
     - scripts/run_rag_pipeline.py
     outs:
diff --git a/params.yaml b/params.yaml
index b3dc785..f7585bb 100644
--- a/params.yaml
+++ b/params.yaml
@@ -13,6 +13,7 @@ files:
   embeddings: data/embeddings.json
   doc-store: data/chroma-data
   test-set: data/eidc_rag_testset.csv
+  cleaned-test-set: data/cleaned_testset.csv
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   eval-plot: data/eval.png
diff --git a/prompts/synth-eval.txt b/prompts/synth-eval.txt
new file mode 100644
index 0000000..0b449b6
--- /dev/null
+++ b/prompts/synth-eval.txt
@@ -0,0 +1,58 @@
+You are a helpful assistant.
+Your task is to assess whether a given question is appropriate for use in evaluating a retrieval augmented generative system.
+The system being evaluated is designed to answer questions about environmental science datasets stored in a data centre called the Environmental Information Data Centre (EIDC).
+The data centre stores metadata about the datasets, and this is the information the system uses to answer the questions.
+The metadata includes information about each dataset, such as the title, description, keywords, and supporting documentation that may provide a limited description of the data collection methods and data processing steps.
+The criteria to assess the questions on are:
+1. Is the question clear?
+2. If the question appears to be specific to one dataset, is it clear to what dataset it is referring?
+3. If the question is more general, is it clear what type of information is being requested?
+
+You must also assess whether or not a pre-generated "ground truth" answer is reasonable. In some cases, no appropriate answer will have been generated because the question is not answerable based on the provided context, in which case the question should be marked as not appropriate.
+
+The question (along with the ground truth) will be provided in JSON format.
+Examples:
+{
+    "question": "What specific parameters are recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset?",
+    "ground_truth": "The specific parameters recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset are seedlings, diameter at breast height (dbh), height and species dominance."
+}
+{
+    "question": "What are the specific types of structures and features included in the GIS shapefiles for the 'Building, infrastructure, and river channel reaches' dataset related to the Chamoli event?",
+    "ground_truth": "The GIS shapefiles include information about bridges, buildings, roads, and river valleys."
+}
+
+Your response should be in the form of a JSON object containing the question and a boolean value indicating whether the question is appropriate or not.
+The object should also include a reason for the assessment.
+
+Examples:
+{
+    "question": "What was the average weed abundance across the surveyed lowland arable fields over the three-year period?",
+    "ground_truth": "The answer to given question is not present in context",
+    "appropriate": false,
+    "reason": "The question appears to be referring to a specific dataset but it is not clear which one. The ground truth does not give an appropriate answer and the "
+}
+{
+    "question": "What specific parameters are recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset?",
+    "ground_truth": "The specific parameters recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset are seedlings, diameter at breast height (dbh), height and species dominance.",
+    "appropriate": true,
+    "reason": "The question is clear and specific to a dataset and the ground truth provides a reasonable answer."
+}
+{
+    "question": "What are the specific types of structures and features included in the GIS shapefiles for the 'Building, infrastructure, and river channel reaches' dataset related to the Chamoli event?",
+    "ground_truth": "The GIS shapefiles include information about bridges, buildings, roads, and river valleys.",
+    "appropriate": true,
+    "reason": "The question is asking for specific information from a particular dataset and the ground_truth provides a reasonable answer."
+}
+{
+    "question": "What were the earliest recorded first egg dates for blue tits (Cyanistes caeruleus) across the three woods between 1993 and 2014?",
+    "appropriate": false,
+    "reason": "The question appears to refer to a specific dataset by referencing three woods, but it is not clear which dataset."
+}
+{
+    "question": "What are the estimated annual loads of nitrogen from non-agricultural sources to rivers in Scotland?",
+    "ground_truth": "The answer to given question is not present in context",
+    "appropriate": false,
+    "reason": "The ground truth does not provide an answer to the question."
+}
+
+The question to assess is:
diff --git a/scripts/evaluate_synthetic_testset.py b/scripts/evaluate_synthetic_testset.py
new file mode 100644
index 0000000..75fba2f
--- /dev/null
+++ b/scripts/evaluate_synthetic_testset.py
@@ -0,0 +1,35 @@
+import json
+from argparse import ArgumentParser
+from json import JSONDecodeError
+from pathlib import Path
+
+import ollama
+import pandas as pd
+from tqdm import tqdm
+
+
+def main(input: str, output: str, model: str, prompt_file: str) -> None:
+    df = pd.read_csv(input)
+    prompt = Path(prompt_file).read_text()
+    df["appropriate"] = False
+    df["reason"] = ""
+    for i, row in tqdm(df.iterrows(), total=len(df)):
+        json_q = json.dumps({"question": row["question"], "ground_truth": row["ground_truth"]}, indent=4)
+        response = ollama.generate(model=model, prompt=prompt + json_q)
+        try:
+            result = json.loads(response["response"])
+            df.loc[i, "appropriate"] = result["appropriate"]
+            df.loc[i, "reason"] = result["reason"]
+        except JSONDecodeError:
+            df.loc[i, "reason"] = "Error decoding response"
+    df.to_csv(output, index=False)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("evaluate_synthetic_testset.py")
+    parser.add_argument("eval_dataset", help="File containing the synthetic questions.")
+    parser.add_argument("output", help="File to output the evaluated synthetic data.")
+    parser.add_argument("-m", "--model", help="The model to use for evaluation.", default="mistral-nemo")
+    parser.add_argument("-p", "--prompt-file", help="File containing the prompt to use for evaluation.", default="prompts/synth-eval.txt")
+    args = parser.parse_args()
+    main(args.eval_dataset, args.output, args.model, args.prompt_file)
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index ca0318b..7a142f2 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -120,8 +120,9 @@ def main(
         rag_pipe.dump(f)
 
     df = pd.read_csv(test_data_file)
+    df = df[df['appropriate'] == True]
     df.drop(
-        columns=["contexts", "evolution_type", "metadata", "episode_done"],
+        columns=["contexts", "evolution_type", "metadata", "episode_done", "appropriate", "reason"],
         inplace=True,
     )
 

From 2b9c1b4cbe3ae5db9c4d6c41c03755e4b99e0f51 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 20 Dec 2024 08:43:03 +0000
Subject: [PATCH 2/2] cleans synth eval prompt

---
 prompts/synth-eval.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/prompts/synth-eval.txt b/prompts/synth-eval.txt
index 0b449b6..4d81772 100644
--- a/prompts/synth-eval.txt
+++ b/prompts/synth-eval.txt
@@ -29,7 +29,7 @@ Examples:
     "question": "What was the average weed abundance across the surveyed lowland arable fields over the three-year period?",
     "ground_truth": "The answer to given question is not present in context",
     "appropriate": false,
-    "reason": "The question appears to be referring to a specific dataset but it is not clear which one. The ground truth does not give an appropriate answer and the "
+    "reason": "The question appears to be referring to a specific dataset but it is not clear which one. The ground_truth does not give an appropriate answer."
 }
 {
     "question": "What specific parameters are recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset?",
@@ -52,7 +52,7 @@
     "question": "What are the estimated annual loads of nitrogen from non-agricultural sources to rivers in Scotland?",
    "ground_truth": "The answer to given question is not present in context",
     "appropriate": false,
-    "reason": "The ground truth does not provide an answer to the question."
+    "reason": "The ground_truth does not provide an answer to the question."
 }
 
 The question to assess is:
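
For reference, a minimal sketch of exercising the new judge prompt on a single question outside the DVC pipeline, which can be handy when tuning prompts/synth-eval.txt. The example row is invented, and format="json" is an optional extra (assuming an ollama client/server recent enough to accept it) that the patch itself does not use; everything else mirrors the loop in scripts/evaluate_synthetic_testset.py.

import json
from pathlib import Path

import ollama

prompt = Path("prompts/synth-eval.txt").read_text()

# Hypothetical row; in the pipeline these values come from ${files.test-set}.
row = {
    "question": "What vegetation parameters are recorded in the ECN woodland dataset?",
    "ground_truth": "Seedlings, diameter at breast height, height and species dominance.",
}

response = ollama.generate(
    model="mistral-nemo",                       # default judge model in the new script
    prompt=prompt + json.dumps(row, indent=4),
    format="json",                              # optional: ask the model for strict JSON output
)
try:
    verdict = json.loads(response["response"])
    print(verdict["appropriate"], "-", verdict["reason"])
except json.JSONDecodeError:
    # The pipeline script records such rows as appropriate=False with a decode-error reason.
    print("Judge response was not valid JSON")

Within the pipeline itself the same call is wired up as the evaluate-synthetic-testset stage in dvc.yaml, whose output feeds run-rag-pipeline.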