Skip to content

Commit

Permalink
Added dummy steps to pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewcoole committed Oct 17, 2024
1 parent aaa3a93 commit f9b9b3e
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 24 deletions.
3 changes: 3 additions & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@
/chunked_embeddings.json
/embeddings.json
/chroma-data
/evaluation_data.csv
/eidc_rag_test_sample.csv
/supporting-docs.json
6 changes: 3 additions & 3 deletions data/synthetic-datasets.dvc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
outs:
- md5: 61b4177259b03a7227784b5b7560726d.dir
size: 144597
nfiles: 1
- md5: 9d87c638c5cc518ea360c474c4e1e9ef.dir
size: 152121
nfiles: 2
hash: md5
path: synthetic-datasets
46 changes: 46 additions & 0 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ stages:
hash: md5
md5: fce18ce3c43175af1cea5d84dac9baf9
size: 4579965
- path: data/supporting-docs.json
hash: md5
md5: 0febface6f1d23fda46c11bef65284f4
size: 34
- path: scripts/chunk_data.py
hash: md5
md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
Expand Down Expand Up @@ -95,3 +99,45 @@ stages:
md5: 2f2ba629bf078284bb6d6be73c6166a7.dir
size: 2069220
nfiles: 5
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
deps:
- path: data/chroma-data
hash: md5
md5: 1d7c499f71791267391ff4108632988c.dir
size: 2069220
nfiles: 5
- path: data/eidc_rag_test_sample.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
size: 7524
- path: scripts/run_rag_pipeline.py
hash: md5
md5: 6d1f49fa8b22288ecd50ed0e3898fd60
size: 3153
outs:
- path: data/evaluation_data.csv
hash: md5
md5: e313cb899c10a2b5ad670b8bc84d059f
size: 8407
generate-testset:
cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
outs:
- path: data/eidc_rag_test_sample.csv
hash: md5
md5: a371d83c5822d256286e80d64d58c3fe
size: 7524
fetch-supporting-docs:
cmd: echo "Fetch supporitng docs from legilo" > data/supporting-docs.json
outs:
- path: data/supporting-docs.json
hash: md5
md5: 0febface6f1d23fda46c11bef65284f4
size: 34
evaluate:
cmd: echo "Evaluate responses"
deps:
- path: data/evaluation_data.csv
hash: md5
md5: e313cb899c10a2b5ad670b8bc84d059f
size: 8407
22 changes: 20 additions & 2 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ stages:
- scripts/fetch_eidc_metadata.py
outs:
- ${files.metadata}
fetch-supporting-docs:
cmd: echo "Fetch supporitng docs from legilo" > ${files.supporting-docs}
outs:
- ${files.supporting-docs}
extract-metadata:
cmd: python scripts/extract_metadata.py ${files.metadata} ${files.extracted}
deps:
Expand All @@ -16,6 +20,7 @@ stages:
cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted}
deps:
- ${files.extracted}
- ${files.supporting-docs}
- scripts/chunk_data.py
outs:
- ${files.chunked}
Expand All @@ -33,6 +38,19 @@ stages:
- scripts/upload_to_docstore.py
outs:
- ${files.doc-store}
generate-testset:
cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
outs:
- ${files.test-set}
run-rag-pipeline:
cmd: python scripts/run_rag_pipeline.py ${files.test-set} ${files.eval-set}
deps:
- ${files.test-set}
- ${files.doc-store}
- scripts/run_rag_pipeline.py
outs:
- ${files.eval-set}
evaluate:
cmd: echo "Evaluate responses"
deps:
- ${files.eval-set}
4 changes: 3 additions & 1 deletion params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ hp:
files:
metadata: "data/eidc_metadata.json"
extracted: "data/extracted_metadata.json"
supporting-docs: "data/supporting-docs.json"
chunked: "data/chunked_data.json"
embeddings: "data/embeddings.json"
doc-store: "data/chroma-data"
test-set: "data/synthetic-datasets/eidc_rag_test_sample.csv"
test-set: "data/eidc_rag_test_sample.csv"
eval-set: "data/evaluation_data.csv"
sample-size: 10 # sample size of 0 will process all data
rag:
model: llama3.1
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ dependencies = [
"chromadb",
"ollama-haystack == 0.0.7",
"chroma-haystack",
"ragas == 0.1.10",
"nltk"
]

[project.optional-dependencies]
Expand Down
42 changes: 24 additions & 18 deletions scripts/run_rag_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def build_rag_pipeline(model_name: str) -> Pipeline:
return rag_pipe


def query_pipeline(query: str, pipeline: Pipeline):
def run_query(query: str, pipeline: Pipeline):
return pipeline.run(
{
"retriever": {"query": query},
Expand All @@ -67,25 +67,27 @@ def query_pipeline(query: str, pipeline: Pipeline):
)


def main(test_data_file: str):
def query_pipeline(questions, rag_pipe):
    """Run a batch of questions through the RAG pipeline.

    Args:
        questions: Iterable of question strings to submit to the pipeline.
        rag_pipe: The Haystack ``Pipeline`` used to answer each question.

    Returns:
        A ``(answers, contexts)`` pair of parallel lists: for each question,
        the generated answer text and the list of retrieved document contents
        that backed it.
    """
    answers = []
    contexts = []
    for question in questions:
        result = run_query(question, rag_pipe)
        # The answer builder yields a list of answers; only the first is used.
        answer = result["answer_builder"]["answers"][0]
        answers.append(answer.data)
        contexts.append([doc.content for doc in answer.documents])
    return answers, contexts


def main(test_data_file: str, output_file: str) -> None:
    """Answer every test question with the RAG pipeline and save the results.

    Args:
        test_data_file: Path to a CSV of test queries; must contain a
            ``question`` column.
        output_file: Path the evaluation CSV is written to.
    """
    rag_pipe = build_rag_pipeline("llama3.1")

    df = pd.read_csv(test_data_file)
    # The synthetic test set carries columns that will be regenerated below;
    # errors="ignore" keeps this from raising if an input lacks them.
    df.drop(columns=["rating", "contexts"], inplace=True, errors="ignore")

    answers, contexts = query_pipeline(df["question"], rag_pipe)

    df["answer"] = answers
    df["contexts"] = contexts
    # index=False: the row index is meaningless to downstream evaluation.
    df.to_csv(output_file, index=False)


if __name__ == "__main__":
Expand All @@ -94,5 +96,9 @@ def main(test_data_file: str):
"test_data_file",
help="File containing test queries to generate response from the RAG pipeline.",
)
parser.add_argument(
"output_file",
help="File to output results to.",
)
args = parser.parse_args()
main(args.test_data_file)
main(args.test_data_file, args.output_file)

1 comment on commit f9b9b3e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

answer_relevancy: 0.49425339911566113
context_recall: 0.5376895418550558
answer_correctness: 0.527280896744315
context_precision: 0.5307366025707845

Please sign in to comment.