diff --git a/data/.gitignore b/data/.gitignore
index 439d928..af08d39 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -18,3 +18,4 @@
 /rag-pipeline.yml
 /pipeline.yml
 /results.csv
+/cleaned_testset.csv
diff --git a/dvc.yaml b/dvc.yaml
index 31f1704..24a4bab 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -37,7 +37,7 @@ stages:
     outs:
       - ${files.chunked}
   create-embeddings:
-    cmd: uv run scripts/create_embeddings.py ${files.chunked} ${files.embeddings} -m ${hp.embeddings-model}
+    cmd: uv run scripts/create_embeddings.py ${files.chunked} ${files.embeddings} -m ${hp.embeddings-model} -u
     deps:
      - ${files.chunked}
      - scripts/create_embeddings.py
@@ -62,17 +62,24 @@ stages:
       - scripts/generate_synthetic_testset.py
     outs:
       - ${files.test-set}
+  evaluate-synthetic-testset:
+    cmd: uv run scripts/evaluate_synthetic_testset.py ${files.test-set} ${files.cleaned-test-set}
+    deps:
+      - ${files.test-set}
+      - scripts/evaluate_synthetic_testset.py
+    outs:
+      - ${files.cleaned-test-set}
   run-rag-pipeline:
     cmd: >-
       uv run scripts/run_rag_pipeline.py
-      -i ${files.test-set}
+      -i ${files.cleaned-test-set}
       -o ${files.eval-set}
       -ds ${files.doc-store}
       -c ${doc-store.collection}
       -m ${rag.model}
       -p ${files.pipeline}
     deps:
-      - ${files.test-set}
+      - ${files.cleaned-test-set}
       - ${files.doc-store}
       - scripts/run_rag_pipeline.py
     outs:
diff --git a/params.yaml b/params.yaml
index a387f96..cd46cb4 100644
--- a/params.yaml
+++ b/params.yaml
@@ -13,6 +13,7 @@ files:
   embeddings: data/embeddings.json
   doc-store: data/chroma-data
   test-set: data/eidc_rag_testset.csv
+  cleaned-test-set: data/cleaned_testset.csv
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   results: data/results.csv
diff --git a/prompts/synth-eval.txt b/prompts/synth-eval.txt
new file mode 100644
index 0000000..4d81772
--- /dev/null
+++ b/prompts/synth-eval.txt
@@ -0,0 +1,58 @@
+You are a helpful assistant.
+Your task is to assess whether a given question is appropriate for use in evaluating a retrieval augmented generative system.
+The system being evaluated is designed to answer questions about environmental science datasets stored in a data centre called the Environmental Information Data Centre (EIDC).
+The data centre stores metadata about datasets, and this is the information the system uses to answer the questions.
+The metadata includes information about the dataset, such as the title, description, keywords, and supporting documentation that may provide a limited description of the data collection methods and data processing steps.
+The criteria to assess the questions on are:
+1. Is the question clear?
+2. If the question appears to be specific to one dataset, is it clear to what dataset it is referring?
+3. If the question is more general, is it clear what type of information is being requested?
+
+You must also assess whether or not a pre-generated "ground truth" answer is reasonable. In some cases, no appropriate answer will have been generated because the question is not answerable based on the provided context, in which case the question should be marked as not appropriate.
+
+The question (along with the ground truth) will be provided in a JSON format.
+Examples:
+{
+    "question": "What specific parameters are recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset?",
+    "ground_truth": "The specific parameters recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset are seedlings, diameter at breast height (dbh), height and species dominance."
+}
+{
+    "question": "What are the specific types of structures and features included in the GIS shapefiles for the 'Building, infrastructure, and river channel reaches' dataset related to the Chamoli event?",
+    "ground_truth": "The GIS shapefiles include information about bridges, buildings, roads, and river valleys."
+}
+
+Your response should be in the form of a JSON object containing the question and a boolean value indicating whether the question is appropriate or not.
+The object should also include a reason for the assessment.
+
+Examples:
+{
+    "question": "What was the average weed abundance across the surveyed lowland arable fields over the three-year period?",
+    "ground_truth": "The answer to given question is not present in context",
+    "appropriate": false,
+    "reason": "The question appears to be referring to a specific dataset but it is not clear which one. The ground_truth does not give an appropriate answer."
+}
+{
+    "question": "What specific parameters are recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset?",
+    "ground_truth": "The specific parameters recorded for each tree within the surveyed 10m x 10m plots in the 'UK Environmental Change Network (ECN) woodland vegetation data' dataset are seedlings, diameter at breast height (dbh), height and species dominance.",
+    "appropriate": true,
+    "reason": "The question is clear and specific to a dataset and the ground truth provides a reasonable answer."
+}
+{
+    "question": "What are the specific types of structures and features included in the GIS shapefiles for the 'Building, infrastructure, and river channel reaches' dataset related to the Chamoli event?",
+    "ground_truth": "The GIS shapefiles include information about bridges, buildings, roads, and river valleys.",
+    "appropriate": true,
+    "reason": "The question is asking for specific information from a particular dataset and the ground_truth provides a reasonable answer."
+}
+{
+    "question": "What were the earliest recorded first egg dates for blue tits (Cyanistes caeruleus) across the three woods between 1993 and 2014?",
+    "appropriate": false,
+    "reason": "The question appears to refer to a specific dataset by referencing three woods, but it is not clear which dataset."
+}
+{
+    "question": "What are the estimated annual loads of nitrogen from non-agricultural sources to rivers in Scotland?",
+    "ground_truth": "The answer to given question is not present in context",
+    "appropriate": false,
+    "reason": "The ground_truth does not provide an answer to the question."
+}
+
+The question to assess is:
diff --git a/run-experiments.sh b/run-experiments.sh
index 83849a2..ed8f77f 100755
--- a/run-experiments.sh
+++ b/run-experiments.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 NC='\033[0m'
 GREEN='\033[0;32m'
-dvc queue remove --all
+uv run dvc queue remove --all
 models=("llama3 llama3.1 mistral-nemo")
 for model in $models
 do
-    dvc exp run --queue -S rag.model=$model
+    uv run dvc exp run --queue -S rag.model=$model -S sub-sample=1 -S max-length=250 -S test-set-size=5
 done
-dvc queue start
-dvc queue status
-echo -e "Run ${GREEN}dvc queue status${NC} to check the state of the experiments"
+uv run dvc queue start
+uv run dvc queue status
+echo -e "Use ${GREEN}uv run dvc queue status${NC} to check the state of the experiments"
diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
index d2e70d6..5346f54 100644
--- a/scripts/chunk_data.py
+++ b/scripts/chunk_data.py
@@ -25,6 +25,7 @@ def chunk_metadata_value(
             "field": metada_value["field"],
             "id": metada_value["id"],
             "index": i,
+            "dataset": metada_value["dataset"],
         }
         for i in range(len(chunks))
     ]
diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py
index 9df7d3e..0c10a64 100644
--- a/scripts/create_embeddings.py
+++ b/scripts/create_embeddings.py
@@ -18,14 +18,25 @@ def batched(iterable, n, *, strict=False):
         yield batch
 
 
-def main(input_file: str, output_file: str, model_name: str) -> None:
+def create_unified_text_to_embed(batch: list) -> list:
+    return [f"Metadata: Dataset: {chunk['dataset']}\nText: {chunk['chunk']}" for chunk in batch]
+
+
+def create_texts_to_embed(use_unified_embeddings: bool, batch: list) -> list:
+    if use_unified_embeddings:
+        return create_unified_text_to_embed(batch)
+    else:
+        return [chunk["chunk"] for chunk in batch]
+
+
+def main(input_file: str, output_file: str, model_name: str, use_unified_embeddings: bool) -> None:
     model = SentenceTransformer(model_name)
     with open(input_file) as input, open(output_file, "w") as output:
         data = json.load(input)
         batches = list(batched(data, 500))
         position = 0
         for batch in tqdm(batches):
-            texts = [chunk["chunk"] for chunk in batch]
+            texts = create_texts_to_embed(use_unified_embeddings, batch)
             embeddings = model.encode(texts)
             for embedding in embeddings:
                 data[position]["embedding"] = embedding.tolist()
@@ -42,5 +53,6 @@ def main(input_file: str, output_file: str, model_name: str) -> None:
     parser.add_argument(
         "-m", "--model", help="Embedding model to use.", default="all-MiniLM-L6-v2"
     )
+    parser.add_argument("-u", "--unified-embeddings", help="Use unified embeddings.", action="store_true")
     args = parser.parse_args()
-    main(args.input, args.output, args.model)
+    main(args.input, args.output, args.model, args.unified_embeddings)
diff --git a/scripts/evaluate_synthetic_testset.py b/scripts/evaluate_synthetic_testset.py
new file mode 100644
index 0000000..75fba2f
--- /dev/null
+++ b/scripts/evaluate_synthetic_testset.py
@@ -0,0 +1,35 @@
+import json
+from argparse import ArgumentParser
+from json import JSONDecodeError
+from pathlib import Path
+
+import ollama
+import pandas as pd
+from tqdm import tqdm
+
+
+def main(input: str, output: str, model: str, prompt_file: str) -> None:
+    df = pd.read_csv(input)
+    prompt = Path(prompt_file).read_text()
+    df["appropriate"] = False
+    df["reason"] = ""
+    for i, row in tqdm(df.iterrows(), total=len(df)):
+        json_q = json.dumps({"question": row["question"], "ground_truth": row["ground_truth"]}, indent=4)
+        response = ollama.generate(model=model, prompt=prompt + json_q)
+        try:
+            result = json.loads(response["response"])
+            df.loc[i, "appropriate"] = result["appropriate"]
+            df.loc[i, "reason"] = result["reason"]
+        except JSONDecodeError:
+            df.loc[i, "reason"] = "Error decoding response"
+    df.to_csv(output, index=False)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser("evaluate_synthetic_testset.py")
+    parser.add_argument("eval_dataset", help="File containing the synthetic questions.")
+    parser.add_argument("output", help="File to output the evaluated synthetic data.")
+    parser.add_argument("-m", "--model", help="The model to use for evaluation.", default="mistral-nemo")
+    parser.add_argument("-p", "--prompt-file", help="File containing the prompt to use for evaluation.", default="prompts/synth-eval.txt")
+    args = parser.parse_args()
+    main(args.eval_dataset, args.output, args.model, args.prompt_file)
diff --git a/scripts/extract_metadata.py b/scripts/extract_metadata.py
index 9bd4c3c..7f374b5 100644
--- a/scripts/extract_metadata.py
+++ b/scripts/extract_metadata.py
@@ -13,6 +13,7 @@ def extact_eidc_metadata_fields(
         if json_data[field]:
             metadata = {}
             metadata["id"] = json_data["identifier"]
+            metadata["dataset"] = json_data["title"]
             metadata["field"] = field
             metadata["value"] = json_data[field]
             metadatas.append(metadata)
diff --git a/scripts/fetch_supporting_docs.py b/scripts/fetch_supporting_docs.py
index d95493b..5be745c 100644
--- a/scripts/fetch_supporting_docs.py
+++ b/scripts/fetch_supporting_docs.py
@@ -11,14 +11,15 @@
 logger = logging.getLogger(__name__)
 
 
-def extract_ids(metadata_file: str) -> List[str]:
+def extract_ids_and_titles(metadata_file: str) -> List[str]:
     with open(metadata_file) as f:
         json_data = json.load(f)
+        titles = [dataset["title"] for dataset in json_data["results"]]
         ids = [dataset["identifier"] for dataset in json_data["results"]]
-        return ids
+        return list(zip(titles, ids))
 
 
-def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str, str]]:
+def get_supporting_docs(dataset_title: str, eidc_id: str, user: str, password: str) -> List[Dict[str, str]]:
     try:
         res = requests.get(
             f"https://legilo.eds-infra.ceh.ac.uk/{eidc_id}/documents",
@@ -27,7 +28,7 @@ def get_supporting_docs(eidc_id: str, user: str, password: str) -> List[Dict[str
         json_data = res.json()
         docs = []
         for key, val in json_data["success"].items():
-            docs.append({"id": eidc_id, "field": key, "value": val})
+            docs.append({"dataset": dataset_title, "id": eidc_id, "field": key, "value": val})
         return docs
     except Exception as e:
         logger.error(
@@ -40,10 +41,10 @@ def main(metadata_file: str, supporting_docs_file: str) -> None:
     load_dotenv()
     user = os.getenv("username")
     password = os.getenv("password")
-    ids = extract_ids(metadata_file)
+    ids_and_titles = extract_ids_and_titles(metadata_file)
     docs = []
-    for id in tqdm(ids):
-        docs.extend(get_supporting_docs(id, user, password))
+    for id_title in tqdm(ids_and_titles):
+        docs.extend(get_supporting_docs(id_title[0], id_title[1], user, password))
     with open(supporting_docs_file, "w") as f:
         json.dump(docs, f, indent=4)
 
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index a37e043..ba282d6 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -126,8 +126,9 @@ def main(
         rag_pipe.dump(f)
 
     df = pd.read_csv(test_data_file)
+    df = df[df["appropriate"] == True]
     df.drop(
-        columns=["contexts", "evolution_type", "metadata", "episode_done"],
+        columns=["contexts", "evolution_type", "metadata", "episode_done", "appropriate", "reason"],
         inplace=True,
     )