diff --git a/.gitignore b/.gitignore
index bf560c6..03bcc6a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,4 @@ cython_debug/
 metrics.txt
 metrics.png
 gdrive-oauth.txt
+/eval
diff --git a/data/.gitignore b/data/.gitignore
index 09fbf7e..addcca2 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -13,3 +13,4 @@
 /supporting-docs.json
 /metrics.json
 /eval.png
+/eidc_rag_testset.csv
diff --git a/dvc.yaml b/dvc.yaml
index 2473eec..903cb5a 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,3 +1,5 @@
+metrics:
+- data/metrics.json
 stages:
   fetch-metadata:
     cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} -s ${sub-sample}
@@ -20,7 +22,7 @@ stages:
     outs:
     - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs}
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs} -m ${max-length}
    deps:
    - ${files.extracted}
    - ${files.supporting-docs}
@@ -42,7 +44,7 @@
     outs:
     - ${files.doc-store}
   generate-testset:
-    cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
+    cmd: head -n 2 data/synthetic-datasets/eidc_rag_test_sample.csv > ${files.test-set}
     outs:
     - ${files.test-set}
   run-rag-pipeline:
@@ -61,5 +63,3 @@
     outs:
     - ${files.metrics}
     - ${files.eval-plot}
-metrics:
-- ${files.metrics}
\ No newline at end of file
diff --git a/params.yaml b/params.yaml
index edf0085..bfc46af 100644
--- a/params.yaml
+++ b/params.yaml
@@ -12,11 +12,12 @@ files:
   chunked: data/chunked_data.json
   embeddings: data/embeddings.json
   doc-store: data/chroma-data
-  test-set: data/eidc_rag_test_sample.csv
+  test-set: data/eidc_rag_testset.csv
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   eval-plot: data/eval.png
-sub-sample: 3 # sample size of 0 will process all data
+sub-sample: 0 # sample n datasets for testing (0 will use all datasets)
+max-length: 0 # truncate longer texts for testing (0 will use all data)
 rag:
   model: llama3.1
   prompt: >-
diff --git a/pyproject.toml b/pyproject.toml
index 3dda280..a56e3f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
     "ragas == 0.1.10",
     "nltk == 3.9.1",
     "nbformat == 4.2.0",
+    "pygit2 == 1.14.1",
 ]
 
 [project.optional-dependencies]
diff --git a/run-experiments.sh b/run-experiments.sh
new file mode 100755
index 0000000..43a8d67
--- /dev/null
+++ b/run-experiments.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+dvc queue remove --all
+dvc exp run --queue -S hp.chunk-size=400 -S sub-sample=1 -S max-length=500
+dvc exp run --queue -S hp.chunk-size=600 -S sub-sample=1 -S max-length=500
+dvc queue start
+dvc queue status
+echo 'Run "dvc queue status" to check the state of the experiments'
diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
index 7fe672b..673651d 100644
--- a/scripts/chunk_data.py
+++ b/scripts/chunk_data.py
@@ -3,19 +3,20 @@
 from typing import Any, Dict, List
 
 
-def chunk_value(value: str, chunk_size: int, overlap: int) -> List[str]:
+def chunk_value(value: str, chunk_size: int, overlap: int, max_length: int) -> List[str]:
     chunks = []
     start = 0
-    while start < len(value):
+    end = max_length if len(value) > max_length > 0 else len(value)
+    while start < end:
         chunks.append(value[start : (start + chunk_size)])
         start += chunk_size - overlap
     return chunks
 
 
 def chunk_metadata_value(
-    metada_value: str, chunk_size: int, overlap: int
+    metada_value: str, chunk_size: int, overlap: int, max_length: int
 ) -> List[Dict[str, Any]]:
-    chunks = chunk_value(metada_value["value"], chunk_size, overlap)
+    chunks = chunk_value(metada_value["value"], chunk_size, overlap, max_length)
     return [
         {
             "chunk": chunks[i],
@@ -28,20 +29,20 @@
 
 
 def chunk_metadata_file(
-    file: str, chunk_size: int, overlap: int
+    file: str, chunk_size: int, overlap: int, max_length: int
 ) -> List[Dict[str, str]]:
     chunked_metadata = []
     with open(file) as f:
         json_data = json.load(f)
         for metadata in json_data:
-            chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
+            chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap, max_length))
     return chunked_metadata
 
 
-def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
+def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int, max_length: int) -> None:
     all_chunked_metadata = []
     for file in files:
-        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
+        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap, max_length))
     with open(ouput_file, "w") as f:
         json.dump(all_chunked_metadata, f, indent=4)
 
@@ -73,6 +74,16 @@ def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> No
         nargs="?",
         const=100,
     )
+    parser.add_argument(
+        "-m",
+        "--max_length",
+        help="""Maximum length of data in characters - meant for truncating large
+        strings in testing. 0 defaults to all data""",
+        type=int,
+        nargs="?",
+        const=0,
+        default=0,
+    )
     args = parser.parse_args()
     assert args.chunk > args.overlap
-    main(args.input_files, args.output, args.chunk, args.overlap)
+    main(args.input_files, args.output, args.chunk, args.overlap, args.max_length)
diff --git a/scripts/create_embeddings.py b/scripts/create_embeddings.py
index 7aa507c..220eed0 100644
--- a/scripts/create_embeddings.py
+++ b/scripts/create_embeddings.py
@@ -1,6 +1,7 @@
 import json
 from argparse import ArgumentParser
-
+import gc
+import torch
 from sentence_transformers import SentenceTransformer
 from torch import Tensor
 from tqdm import tqdm
@@ -16,6 +17,8 @@ def main(input_file: str, output_file: str) -> None:
        data = json.load(input)
        for chunk in tqdm(data):
            chunk["embedding"] = create_embedding(chunk["chunk"]).tolist()
+            gc.collect()
+            torch.cuda.empty_cache()
        json.dump(data, output)
 
 
diff --git a/scripts/evaluate.py b/scripts/evaluate.py
index c130e96..fc2ffb8 100644
--- a/scripts/evaluate.py
+++ b/scripts/evaluate.py
@@ -1,5 +1,6 @@
 import json
 from argparse import ArgumentParser
+from pathlib import Path
 
 import nest_asyncio
 import pandas as pd
@@ -44,10 +45,9 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
         run_config=RunConfig(max_workers=1),
     )
     result_df = result.to_pandas()
-    pio.templates.default = "gridon"
-    fig = go.Figure()
 
-    with open(metric_output, "w") as f:
+    Path(metric_output).parent.mkdir(parents=True, exist_ok=True)
+    with open(metric_output, "w+") as f:
         json.dump(result, f)
     metrics = [
         metric
@@ -55,6 +55,10 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
         if metric not in ["question", "ground_truth", "answer", "contexts"]
     ]
 
+
+    pio.templates.default = "gridon"
+    fig = go.Figure()
+
     for metric in metrics:
         fig.add_trace(
             go.Violin(
@@ -66,7 +70,7 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
             )
         )
     fig.update_yaxes(range=[-0.02, 1.02])
-    with open(image_output, "wb") as f:
+    with open(image_output, "wb+") as f:
         f.write(fig.to_image(format="png"))
 
 