From 11ee206b2a6a8b83296be7e1a403a8e99c908190 Mon Sep 17 00:00:00 2001 From: mpc Date: Fri, 22 Nov 2024 09:16:18 +0000 Subject: [PATCH] Outputs rag pipeline as a yaml config file --- data/.gitignore | 1 + dvc.yaml | 26 +++++++++++++++++++++++--- scripts/run_rag_pipeline.py | 23 +++++++++++++++++++---- scripts/upload_to_docstore.py | 4 ---- 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/data/.gitignore b/data/.gitignore index 14b46ba..4a16dfd 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -15,3 +15,4 @@ /eval.png /eidc_rag_testset.csv /eidc_rag_test_set.csv +/rag-pipeline.yml diff --git a/dvc.yaml b/dvc.yaml index db54d68..4dbf7e1 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -22,7 +22,14 @@ stages: outs: - ${files.extracted} chunk-data: - cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs} -m ${max-length} + cmd: >- + python scripts/chunk_data.py + -o ${files.chunked} + -c ${hp.chunk-size} + -ol ${hp.overlap} + ${files.extracted} + ${files.supporting-docs} + -m ${max-length} deps: - ${files.extracted} - ${files.supporting-docs} @@ -37,7 +44,12 @@ stages: outs: - ${files.embeddings} upload-to-docstore: - cmd: python scripts/upload_to_docstore.py ${files.embeddings} -o ${doc-store.files} -em ${hp.embeddings-model} -c ${doc-store.collection} + cmd: >- + python scripts/upload_to_docstore.py + ${files.embeddings} + -o ${doc-store.files} + -em ${hp.embeddings-model} + -c ${doc-store.collection} deps: - ${files.embeddings} - scripts/upload_to_docstore.py @@ -48,13 +60,21 @@ stages: outs: - ${files.test-set} run-rag-pipeline: - cmd: python scripts/run_rag_pipeline.py -i ${files.test-set} -o ${files.eval-set} -ds ${files.doc-store} -c ${doc-store.collection} -m ${rag.model} + cmd: >- + python scripts/run_rag_pipeline.py + -i ${files.test-set} + -o ${files.eval-set} + -ds ${files.doc-store} + -c ${doc-store.collection} + -m ${rag.model} + -p ${files.pipeline} deps: - ${files.test-set} - ${files.doc-store} - scripts/run_rag_pipeline.py outs: - ${files.eval-set} + - ${files.pipeline} evaluate: cmd: python scripts/evaluate.py ${files.eval-set} -m ${files.metrics} -img ${files.eval-plot} deps: diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py index f4f9ff4..0d823a5 100644 --- a/scripts/run_rag_pipeline.py +++ b/scripts/run_rag_pipeline.py @@ -1,9 +1,7 @@ import shutil from argparse import ArgumentParser from typing import Any, Dict, List, Tuple -__import__("pysqlite3") -import sys -sys.modules["sqlite3"] = sys.modules.pop("pysqlite3") + import pandas as pd from haystack import Pipeline from haystack.components.builders import PromptBuilder @@ -90,11 +88,15 @@ def main( doc_store_path: str, collection_name: str, model: str, + pipeline_file: str, ) -> None: shutil.copytree(doc_store_path, TMP_DOC_PATH) rag_pipe = build_rag_pipeline(model, collection_name) + with open(pipeline_file, "w") as f: + rag_pipe.dump(f) + df = pd.read_csv(test_data_file) df.drop(columns=["rating", "contexts"], inplace=True) @@ -136,5 +138,18 @@ def main( help="Model to use in RAG pipeline.", default="llama3.1", ) + parser.add_argument( + "-p", + "--pipeline_file", + help="File to save the built RAG pipeline to.", + default="pipeline.yml", + ) args = parser.parse_args() - main(args.input, args.output, args.doc_store, args.collection, args.model) + main( + args.input, + args.output, + args.doc_store, + args.collection, + args.model, + args.pipeline_file, + ) diff --git a/scripts/upload_to_docstore.py b/scripts/upload_to_docstore.py index 860ead0..49741d6 100644 --- a/scripts/upload_to_docstore.py +++ b/scripts/upload_to_docstore.py @@ -4,10 +4,6 @@ import uuid from argparse import ArgumentParser -__import__("pysqlite3") -import sys - -sys.modules["sqlite3"] = sys.modules.pop("pysqlite3") import chromadb from chromadb.utils import embedding_functions from chromadb.utils.batch_utils import create_batches