From 679a2d25e94253b78a04c3a0e15f5289c1762c9f Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 19 Dec 2024 16:21:57 +0000
Subject: [PATCH 1/5] adds llm as a judge evaluation of synthetic testset

---
 data/.gitignore                       |  1 +
 dvc.yaml                              | 11 ++++-
 params.yaml                           |  1 +
 prompts/synth-eval.txt                | 58 +++++++++++++++++++++++++++
 scripts/evaluate_synthetic_testset.py | 35 ++++++++++++++++
 scripts/run_rag_pipeline.py           |  3 +-
 6 files changed, 106 insertions(+), 3 deletions(-)
 create mode 100644 prompts/synth-eval.txt
 create mode 100644 scripts/evaluate_synthetic_testset.py

diff --git a/data/.gitignore b/data/.gitignore
index b8dc417..aa990e4 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -17,3 +17,4 @@
 /eidc_rag_test_set.csv
 /rag-pipeline.yml
 /pipeline.yml
+/cleaned_testset.csv
diff --git a/dvc.yaml b/dvc.yaml
index 179c315..1880ef2 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -62,17 +62,24 @@ stages:
       - scripts/generate_synthetic_testset.py
     outs:
       - ${files.test-set}
+  evaluate-synthetic-testset:
+    cmd: uv run scripts/evaluate_synthetic_testset.py ${files.test-set} ${files.cleaned-test-set}
+    deps:
+      - ${files.test-set}
+      - scripts/evaluate_synthetic_testset.py
+    outs:
+      - ${files.cleaned-test-set}
   run-rag-pipeline:
     cmd: >-
       uv run scripts/run_rag_pipeline.py
-      -i ${files.test-set}
+      -i ${files.cleaned-test-set}
       -o ${files.eval-set}
       -ds ${files.doc-store}
       -c ${doc-store.collection}
       -m ${rag.model}
       -p ${files.pipeline}
     deps:
-      - ${files.test-set}
+      - ${files.cleaned-test-set}
      - ${files.doc-store}
       - scripts/run_rag_pipeline.py
     outs:
diff --git a/params.yaml b/params.yaml
index b3dc785..f7585bb 100644
--- a/params.yaml
+++ b/params.yaml
@@ -13,6 +13,7 @@ files:
   embeddings: data/embeddings.json
   doc-store: data/chroma-data
   test-set: data/eidc_rag_testset.csv
+  cleaned-test-set: data/cleaned_testset.csv
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   eval-plot: data/eval.png
diff --git a/prompts/synth-eval.txt b/prompts/synth-eval.txt
new file mode 100644
index 0000000..0b449b6
--- /dev/null
+++ b/prompts/synth-eval.txt
@@ -0,0 +1,58 @@
+You are a helpful assistant.
+Your task is to assess whether a given question is appropriate for use in evaluating a retrieval augmented generative system.
+The system being evaluated is designed to answer questions about environmental science datasets stored in a data centre called the Environmental Information Data Centre (EIDC).
+The data centre stores metadata information about datasets and this is the information the system uses to answer the questions.
+The metadata includes information about the dataset, such as the title, description, keywords, and supporting documentation that may provide limited description of the data collection methods and data processing steps.
+The criteria to assess the questions on are:
+1. Is the question clear?
+2. If the question appears to be specific to one dataset, is it clear to what dataset it is referring?
+3. If the question is more general, is it clear what type of information is being requested?
+
+You must also assess whether or not a pre-generated "ground truth" answer is reasonable. In some cases, no appropriate answer will have been generated because the question is not answerable based on the provided context, in which case the question should be marked as not appropriate.
+
+The question (along with the ground truth) will be provided in a JSON format.
+Examples:
+{
+    "question": "What specific parameters are recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset?",
+    "ground_truth": "The specific parameters recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset are seedlings, diameter at breast height (dbh), height and species dominance."
+}
+{
+    "question": "What are the specific types of structures and features included in the GIS shapefiles for the 'Building, infrastructure, and river channel reaches' dataset related to the Chamoli event?",
+    "ground_truth": "The GIS shapefiles include information about bridges, buildings, roads, and river valleys."
+}
+
+Your response should be in the form of a JSON object containing the question and a boolean value indicating whether the question is appropriate or not.
+The object should also include a reason code for the assessment.
+
+Examples:
+{
+    "question": "What was the average weed abundance across the surveyed lowland arable fields over the three-year period?",
+    "ground_truth": "The answer to given question is not present in context",
+    "appropriate": false,
+    "reason": "The question appears to be referring to a specific dataset but it is not clear which one. The ground truth does not give an appropriate answer and the "
+}
+{
+    "question": "What specific parameters are recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset?",
+    "ground_truth": "The specific parameters recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset are seedlings, diameter at breast height (dbh), height and species dominance.",
+    "appropriate": true,
+    "reason": "The question is clear and specific to a dataset and the ground truth provides a reasonable answer."
+}
+{
+    "question": "What are the specific types of structures and features included in the GIS shapefiles for the 'Building, infrastructure, and river channel reaches' dataset related to the Chamoli event?",
+    "ground_truth": "The GIS shapefiles include information about bridges, buildings, roads, and river valleys.",
+    "appropriate": true,
+    "reason": "The question is asking for specific information from a particular dataset and the ground_truth provides a reasonable answer."
+}
+{
+    "question": "What were the earliest recorded first egg dates for blue tits (Cyanistes caeruleus) across the three woods between 1993 and 2014?",
+    "appropriate": false,
+    "reason": "The question appears to refer to a specific dataset by referencing three woods, but it is not clear which dataset."
+}
+{
+    "question": "What are the estimated annual loads of nitrogen from non-agricultural sources to rivers in Scotland?",
+    "ground_truth": "The answer to given question is not present in context",
+    "appropriate": false,
+    "reason": "The ground truth does not provide an answer to the question."
+}
+
+The question to assess is:
diff --git a/scripts/evaluate_synthetic_testset.py b/scripts/evaluate_synthetic_testset.py
new file mode 100644
index 0000000..75fba2f
--- /dev/null
+++ b/scripts/evaluate_synthetic_testset.py
@@ -0,0 +1,35 @@
+import json
+from argparse import ArgumentParser
+from json import JSONDecodeError
+from pathlib import Path
+
+import ollama
+import pandas as pd
+from tqdm import tqdm
+
+
+def main(input: str, output: str, model: str, prompt_file: str) -> None:
+    df = pd.read_csv(input)
+    prompt = Path(prompt_file).read_text()
+    df["appropriate"] = False
+    df["reason"] = ""
+    for i, row in tqdm(df.iterrows(), total=len(df)):
+        json_q = json.dumps({"question": row["question"], "ground_truth": row["ground_truth"]}, indent=4)
+        response = ollama.generate(model=model, prompt=prompt + json_q)
+        try:
+            result = json.loads(response["response"])
+            df.loc[i, "appropriate"] = result["appropriate"]
+            df.loc[i, "reason"] = result["reason"]
+        except JSONDecodeError:
+            df.loc[i, "reason"] = "Error decoding response"
+    df.to_csv(output, index=False)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("evaluate_synthetic_testset.py")
+    parser.add_argument("eval_dataset", help="File containing the synthetic questions.")
+    parser.add_argument("output", help="File to output the evaluated synthetic data.")
+    parser.add_argument("-m", "--model", help="The model to use for evaluation.", default="mistral-nemo")
+    parser.add_argument("-p", "--prompt-file", help="File containing the prompt to use for evaluation", default="prompts/synth-eval.txt")
+    args = parser.parse_args()
+    main(args.eval_dataset, args.output, args.model, args.prompt_file)
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index ca0318b..7a142f2 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -120,8 +120,9 @@ def main(
         rag_pipe.dump(f)
 
     df = pd.read_csv(test_data_file)
+    df = df[df['appropriate'] == True]
     df.drop(
-        columns=["contexts", "evolution_type", "metadata", "episode_done"],
+        columns=["contexts", "evolution_type", "metadata", "episode_done", "appropriate", "reason"],
         inplace=True,
     )
 

From 2b9c1b4cbe3ae5db9c4d6c41c03755e4b99e0f51 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 20 Dec 2024 08:43:03 +0000
Subject: [PATCH 2/5] cleans synth eval prompt

---
 prompts/synth-eval.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/prompts/synth-eval.txt b/prompts/synth-eval.txt
index 0b449b6..4d81772 100644
--- a/prompts/synth-eval.txt
+++ b/prompts/synth-eval.txt
@@ -29,7 +29,7 @@ Examples:
     "question": "What was the average weed abundance across the surveyed lowland arable fields over the three-year period?",
     "ground_truth": "The answer to given question is not present in context",
     "appropriate": false,
-    "reason": "The question appears to be referring to a specific dataset but it is not clear which one. The ground truth does not give an appropriate answer and the "
+    "reason": "The question appears to be referring to a specific dataset but it is not clear which one. The ground_truth does not give an appropriate answer."
 }
 {
     "question": "What specific parameters are recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset?",
@@ -52,7 +52,7 @@ Examples:
     "question": "What are the estimated annual loads of nitrogen from non-agricultural sources to rivers in Scotland?",
     "ground_truth": "The answer to given question is not present in context",
     "appropriate": false,
-    "reason": "The ground truth does not provide an answer to the question."
+    "reason": "The ground_truth does not provide an answer to the question."
 }
 
 The question to assess is:

From e2f8fa877e6b4314b2ff577e4073f80cce1342da Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 20 Dec 2024 09:25:34 +0000
Subject: [PATCH 3/5] adds uv support in experiments script

---
 run-experiments.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/run-experiments.sh b/run-experiments.sh
index 83849a2..ed8f77f 100755
--- a/run-experiments.sh
+++ b/run-experiments.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 NC='\033[0m'
 GREEN='\033[0;32m'
-dvc queue remove --all
+uv run dvc queue remove --all
 models=("llama3 llama3.1 mistral-nemo")
 for model in $models
 do
-    dvc exp run --queue -S rag.model=$model
+    uv run dvc exp run --queue -S rag.model=$model -S sub-sample=1 -S max-length=250 -S test-set-size=5
 done
-dvc queue start
-dvc queue status
-echo -e "Run ${GREEN}dvc queue status${NC} to check the state of the experiments"
+uv run dvc queue start
+uv run dvc queue status
+echo -e "Use ${GREEN}uv run dvc queue status${NC} to check the state of the experiments"

From a1d2aa80ae8dee71ebd5e48988a6d29dbf6ab16a Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 20 Dec 2024 10:12:34 +0000
Subject: [PATCH 4/5] adds unified embeddings using the dataset title to text chunks

---
 scripts/chunk_data.py            |  1 +
 scripts/create_embeddings.py     | 18 +++++++++++++++---
 scripts/extract_metadata.py      |  1 +
 scripts/fetch_supporting_docs.py | 15 ++++++++-------
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
index d2e70d6..5346f54 100644
--- a/scripts/chunk_data.py
+++ b/scripts/chunk_data.py
@@ -25,6 +25,7 @@ def chunk_metadata_value(
             "field": metada_value["field"],
             "id": metada_value["id"],
             "index": i,
+            "dataset": metada_value["dataset"],
         }
         for i in range(len(chunks))
     ]
diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py
index 9df7d3e..0c10a64 100644
--- a/scripts/create_embeddings.py
+++ b/scripts/create_embeddings.py
@@ -18,14 +18,25 @@ def batched(iterable, n, *, strict=False):
         yield batch
 
 
-def main(input_file: str, output_file: str, model_name: str) -> None:
+def create_unified_text_to_embed(batch: list) -> list:
+    return [f"Metadata: Dataset: {chunk['dataset']}\nText: {chunk['chunk']}" for chunk in batch]
+
+
+def create_texts_to_embed(use_unified_embeddings: bool, batch: list) -> list:
+    if use_unified_embeddings:
+        return create_unified_text_to_embed(batch)
+    else:
+        return [chunk["chunk"] for chunk in batch]
+
+
+def main(input_file: str, output_file: str, model_name: str, use_unified_embeddings: bool) -> None:
     model = SentenceTransformer(model_name)
     with open(input_file) as input, open(output_file, "w") as output:
         data = json.load(input)
         batches = list(batched(data, 500))
         position = 0
         for batch in tqdm(batches):
-            texts = [chunk["chunk"] for chunk in batch]
+            texts = create_texts_to_embed(use_unified_embeddings, batch)
             embeddings = model.encode(texts)
             for embedding in embeddings:
                 data[position]["embedding"] = embedding.tolist()
@@ -42,5 +53,6 @@ def main(input_file: str, output_file: str, model_name: str) -> None:
     parser.add_argument(
         "-m", "--model", help="Embedding model to use.", default="all-MiniLM-L6-v2"
     )
+    parser.add_argument("-u", "--unified-embeddings", help="Use unified embeddings.", action="store_true")
     args = parser.parse_args()
-    main(args.input, args.output, args.model)
+    main(args.input, args.output, args.model, args.unified_embeddings)
diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py
index 9bd4c3c..7f374b5 100644
--- a/scripts/extract_metadata.py
+++ b/scripts/extract_metadata.py
@@ -13,6 +13,7 @@ def extact_eidc_metadata_fields(
         if json_data[field]:
             metadata = {}
             metadata["id"] = json_data["identifier"]
+            metadata["dataset"] = json_data["title"]
             metadata["field"] = field
             metadata["value"] = json_data[field]
             metadatas.append(metadata)
diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py
index d95493b..5be745c 100644
--- a/scripts/fetch_supporting_docs.py
+++ b/scripts/fetch_supporting_docs.py
@@ -11,14 +11,15 @@ logger = logging.getLogger(__name__)
 
 
-def extract_ids(metadata_file: str) -> List[str]:
+def extract_ids_and_titles(metadata_file: str) -> List[str]:
     with open(metadata_file) as f:
         json_data = json.load(f)
+        titles = [dataset["title"] for dataset in json_data["results"]]
         ids = [dataset["identifier"] for dataset in json_data["results"]]
-    return ids
+    return list(zip(titles, ids))
 
 
-def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]:
+def get_supporting_docs(dataset_title: str, eidc_id: str, user: str, password: str) -> List[Dict[str, str]]:
     try:
         res = requests.get(
             f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents",
@@ -27,7 +28,7 @@ def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str
         json_data = res.json()
         docs = []
         for key, val in json_data["success"].items():
-            docs.append({"id": eidc_id, "field": key, "value": val})
+            docs.append({"dataset": dataset_title, "id": eidc_id, "field": key, "value": val})
         return docs
     except Exception as e:
         logger.error(
@@ -40,10 +41,10 @@ def main(metadata_file: str, supporting_docs_file: str) -> None:
     load_dotenv()
     user = os.getenv("username")
     password = os.getenv("password")
-    ids = extract_ids(metadata_file)
+    ids_and_titles = extract_ids_and_titles(metadata_file)
    docs = []
-    for id in tqdm(ids):
-        docs.extend(get_supporting_docs(id, user, password))
+    for id_title in tqdm(ids_and_titles):
+        docs.extend(get_supporting_docs(id_title[0], id_title[1], user, password))
     with open(supporting_docs_file, "w") as f:
         json.dump(docs, f, indent=4)
 

From dcd49c12fe24689465387e1a9d2ee1bcc1034923 Mon Sep 17 00:00:00 2001
From: mpc
Date: Fri, 20 Dec 2024 10:14:10 +0000
Subject: [PATCH 5/5] enables unified embeddings by default in dvc pipeline

---
 dvc.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dvc.yaml b/dvc.yaml
index 1880ef2..d797449 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -37,7 +37,7 @@ stages:
     outs:
       - ${files.chunked}
   create-embeddings:
-    cmd: uv run scripts/create_embeddings.py ${files.chunked} ${files.embeddings} -m ${hp.embeddings-model}
+    cmd: uv run scripts/create_embeddings.py ${files.chunked} ${files.embeddings} -m ${hp.embeddings-model} -u
     deps:
       - ${files.chunked}
       - scripts/create_embeddings.py