Skip to content

Commit

Permalink
Adds script to run experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Nov 21, 2024
1 parent 0592fcd commit 557363d
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,4 @@ cython_debug/
metrics.txt
metrics.png
gdrive-oauth.txt
/eval
1 change: 1 addition & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@
/supporting-docs.json
/metrics.json
/eval.png
/eidc_rag_testset.csv
8 changes: 4 additions & 4 deletions dvc.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
metrics:
- data/metrics.json
stages:
fetch-metadata:
cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} -s ${sub-sample}
Expand All @@ -20,7 +22,7 @@ stages:
outs:
- ${files.extracted}
chunk-data:
cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs}
cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs} -m ${max-length}
deps:
- ${files.extracted}
- ${files.supporting-docs}
Expand All @@ -42,7 +44,7 @@ stages:
outs:
- ${files.doc-store}
generate-testset:
cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
cmd: head -n 2 data/synthetic-datasets/eidc_rag_test_sample.csv > ${files.test-set}
outs:
- ${files.test-set}
run-rag-pipeline:
Expand All @@ -61,5 +63,3 @@ stages:
outs:
- ${files.metrics}
- ${files.eval-plot}
metrics:
- ${files.metrics}
5 changes: 3 additions & 2 deletions params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ files:
chunked: data/chunked_data.json
embeddings: data/embeddings.json
doc-store: data/chroma-data
test-set: data/eidc_rag_test_sample.csv
test-set: data/eidc_rag_testset.csv
eval-set: data/evaluation_data.csv
metrics: data/metrics.json
eval-plot: data/eval.png
sub-sample: 3 # sample size of 0 will process all data
sub-sample: 0 # sample n datasets for testing (0 will use all datasets)
max-length: 0 # truncate longer texts for testing (0 will use all data)
rag:
model: llama3.1
prompt: >-
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ dependencies = [
"ragas == 0.1.10",
"nltk == 3.9.1",
"nbformat == 4.2.0",
"pygit2 == 1.14.1",
]

[project.optional-dependencies]
Expand Down
7 changes: 7 additions & 0 deletions run-experiments.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Queue and launch a small sweep of DVC experiments over hp.chunk-size.
# Each run overrides sub-sample and max-length via -S so the sweep works on a
# reduced amount of data.

# Clear anything left in the experiment queue before adding new runs.
dvc queue remove --all

# Queue one experiment per chunk size.
dvc exp run --queue -S hp.chunk-size=400 -S sub-sample=1 -S max-length=500
dvc exp run --queue -S hp.chunk-size=600 -S sub-sample=1 -S max-length=500

# Start processing the queue and print its current state.
dvc queue start
dvc queue status

# Single quotes keep the backticks literal; inside double quotes they are
# command substitution and would execute `dvc queue status` a second time.
echo 'Run `dvc queue status` to check the state of the experiments'
28 changes: 19 additions & 9 deletions scripts/chunk_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,20 @@
from typing import Any, Dict, List


def chunk_value(
    value: str, chunk_size: int, overlap: int, max_length: int = 0
) -> List[str]:
    """Split *value* into consecutive, overlapping chunks.

    Args:
        value: Text to split.
        chunk_size: Maximum number of characters in each chunk.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.
        max_length: When positive, only the first ``max_length`` characters of
            ``value`` are chunked. ``0`` (the default), ``None`` or a negative
            number disables truncation.

    Returns:
        The chunks in order of appearance; an empty list for empty input.

    Raises:
        ValueError: If ``chunk_size`` is not greater than ``overlap``, since
            the window would never advance.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("chunk_size must be greater than overlap")
    # Truncate up front so that no chunk can reach past max_length; limiting
    # only the loop bound would still let the last slice read beyond it.
    if max_length is not None and max_length > 0:
        value = value[:max_length]
    return [value[start : start + chunk_size] for start in range(0, len(value), step)]


def chunk_metadata_value(
metada_value: str, chunk_size: int, overlap: int
metada_value: str, chunk_size: int, overlap: int, max_length: int
) -> List[Dict[str, Any]]:
chunks = chunk_value(metada_value["value"], chunk_size, overlap)
chunks = chunk_value(metada_value["value"], chunk_size, overlap, max_length)
return [
{
"chunk": chunks[i],
Expand All @@ -28,20 +29,20 @@ def chunk_metadata_value(


def chunk_metadata_file(
    file: str, chunk_size: int, overlap: int, max_length: int = 0
) -> List[Dict[str, Any]]:
    """Chunk every metadata entry stored in a JSON file.

    Args:
        file: Path to a JSON file whose top level is iterated entry by entry;
            each entry is passed to ``chunk_metadata_value``.
        chunk_size: Chunk size forwarded to ``chunk_metadata_value``.
        overlap: Overlap forwarded to ``chunk_metadata_value``.
        max_length: Truncation limit forwarded to ``chunk_metadata_value``.
            Defaults to ``0`` so that callers written against the earlier
            three-argument signature keep working.

    Returns:
        The chunk records of all entries, concatenated in file order.
    """
    # Read with an explicit encoding rather than the platform default, and
    # release the file handle before the chunking work starts.
    with open(file, encoding="utf-8") as f:
        json_data = json.load(f)
    chunked_metadata: List[Dict[str, Any]] = []
    for metadata in json_data:
        chunked_metadata.extend(
            chunk_metadata_value(metadata, chunk_size, overlap, max_length)
        )
    return chunked_metadata


def main(
    files: List[str],
    ouput_file: str,
    chunk_size: int,
    overlap: int,
    max_length: int = 0,
) -> None:
    """Chunk the metadata of several JSON files and write the result as JSON.

    Args:
        files: Paths of the input JSON files, processed in the given order.
        ouput_file: Path of the JSON file to write (parameter name kept as-is
            for compatibility with keyword callers).
        chunk_size: Chunk size forwarded to ``chunk_metadata_file``.
        overlap: Overlap forwarded to ``chunk_metadata_file``.
        max_length: Truncation limit forwarded to ``chunk_metadata_file``.
            Defaults to ``0`` so that callers written against the earlier
            four-argument signature keep working.
    """
    all_chunked_metadata = []
    for file in files:
        all_chunked_metadata.extend(
            chunk_metadata_file(file, chunk_size, overlap, max_length)
        )
    with open(ouput_file, "w") as f:
        json.dump(all_chunked_metadata, f, indent=4)

Expand Down Expand Up @@ -73,6 +74,15 @@ def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> No
nargs="?",
const=100,
)
parser.add_argument(
"-m",
"--max_length",
help="""Maximum length of data in characters - meant for truncating large
strings in testing. 0 defaults to all data""",
type=int,
nargs="?",
const=0,
)
args = parser.parse_args()
assert args.chunk > args.overlap
main(args.input_files, args.output, args.chunk, args.overlap)
main(args.input_files, args.output, args.chunk, args.overlap, args.max_length)
5 changes: 4 additions & 1 deletion scripts/create_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
from argparse import ArgumentParser

import gc
import torch
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm import tqdm
Expand All @@ -16,6 +17,8 @@ def main(input_file: str, output_file: str) -> None:
data = json.load(input)
for chunk in tqdm(data):
chunk["embedding"] = create_embedding(chunk["chunk"]).tolist()
gc.collect()
torch.cuda.empty_cache()
json.dump(data, output)


Expand Down
12 changes: 8 additions & 4 deletions scripts/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
from argparse import ArgumentParser
from pathlib import Path

import nest_asyncio
import pandas as pd
Expand Down Expand Up @@ -44,17 +45,20 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
run_config=RunConfig(max_workers=1),
)
result_df = result.to_pandas()
pio.templates.default = "gridon"
fig = go.Figure()

with open(metric_output, "w") as f:
Path(metric_output).parent.mkdir(parents=True, exist_ok=True)
with open(metric_output, "w+") as f:
json.dump(result, f)
metrics = [
metric
for metric in result_df.columns.to_list()
if metric not in ["question", "ground_truth", "answer", "contexts"]
]


pio.templates.default = "gridon"
fig = go.Figure()

for metric in metrics:
fig.add_trace(
go.Violin(
Expand All @@ -66,7 +70,7 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
)
)
fig.update_yaxes(range=[-0.02, 1.02])
with open(image_output, "wb") as f:
with open(image_output, "wb+") as f:
f.write(fig.to_image(format="png"))


Expand Down

0 comments on commit 557363d

Please sign in to comment.