diff --git a/data/.gitignore b/data/.gitignore
index aa990e4..af08d39 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -17,4 +17,5 @@
 /eidc_rag_test_set.csv
 /rag-pipeline.yml
 /pipeline.yml
+/results.csv
 /cleaned_testset.csv
diff --git a/dvc.yaml b/dvc.yaml
index d797449..24a4bab 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -86,10 +86,11 @@ stages:
     - ${files.eval-set}
     - ${files.pipeline}
   evaluate:
-    cmd: uv run scripts/evaluate.py ${files.eval-set} -m ${files.metrics} -img ${files.eval-plot}
+    cmd: uv run scripts/evaluate.py ${files.eval-set} -m ${files.metrics} -img ${files.eval-plot} -r ${files.results}
     deps:
     - ${files.eval-set}
    - scripts/evaluate.py
     outs:
     - ${files.metrics}
     - ${files.eval-plot}
+    - ${files.results}
diff --git a/params.yaml b/params.yaml
index f7585bb..cd46cb4 100644
--- a/params.yaml
+++ b/params.yaml
@@ -16,6 +16,7 @@ files:
   cleaned-test-set: data/cleaned_testset.csv
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
+  results: data/results.csv
   eval-plot: data/eval.png
   pipeline: data/pipeline.yml
 sub-sample: 0 # sample n datasets for testing (0 will use all datasets)
diff --git a/scripts/evaluate.py b/scripts/evaluate.py
index fbe348f..b4ea981 100644
--- a/scripts/evaluate.py
+++ b/scripts/evaluate.py
@@ -22,7 +22,7 @@
 from ragas.run_config import RunConfig


-def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
+def main(eval_dataset: str, metric_output: str, image_output: str, results_output: str) -> None:
     nest_asyncio.apply()  # apply the event loop async fix
     df = pd.read_csv(eval_dataset, converters={"contexts": pd.eval})
     eval_dataset = Dataset.from_pandas(df)
@@ -45,7 +45,7 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
         run_config=RunConfig(max_workers=1),
     )
     result_df = result.to_pandas()
-
+    result_df.to_csv(results_output, index=False)
     Path(metric_output).parent.mkdir(parents=True, exist_ok=True)
     with open(metric_output, "w+") as f:
         json.dump(result, f)
@@ -88,5 +88,11 @@ def main(eval_dataset: str, metric_output: str, image_output: str) -> None:
         help="File to save image plot to.",
         default="data/evaluation.png",
     )
+    parser.add_argument(
+        "-r",
+        "--results",
+        help="File to save evaluation results",
+        default="data/results.csv",
+    )
     args = parser.parse_args()
-    main(args.eval_dataset, args.metrics_output, args.image_output)
+    main(args.eval_dataset, args.metrics_output, args.image_output, args.results)
diff --git a/scripts/run_rag_pipeline.py b/scripts/run_rag_pipeline.py
index 7a142f2..ba282d6 100644
--- a/scripts/run_rag_pipeline.py
+++ b/scripts/run_rag_pipeline.py
@@ -11,6 +11,7 @@
 from haystack_integrations.components.generators.ollama.generator import OllamaGenerator
 from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
 from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+from tqdm import tqdm


 TMP_DOC_PATH = ".tmp/doc-store"
@@ -93,12 +94,17 @@ def run_query(query: str, pipeline: Pipeline) -> Dict[str, Any]:


 def query_pipeline(questions: List[str], rag_pipe: Pipeline) -> Tuple[str, List[str]]:
     answers = []
     contexts = []
-    for q in questions:
-        response = run_query(q, rag_pipe)
-        answers.append(response["answer_builder"]["answers"][0].data)
-        contexts.append(
-            [doc.content for doc in response["answer_builder"]["answers"][0].documents]
-        )
+    for q in tqdm(questions):
+        try:
+            response = run_query(q, rag_pipe)
+            answers.append(response["answer_builder"]["answers"][0].data)
+            contexts.append(
+                [doc.content for doc in response["answer_builder"]["answers"][0].documents]
+            )
+        except Exception as e:
+            print(str(e))
+            answers.append("Error")
+            contexts.append([])
     return answers, contexts