Merge pull request #12 from NERC-CEH/sub-sampling
Added option to sub-sample initial metadata download
matthewcoole authored Nov 15, 2024
2 parents cb9522b + e14b95c commit 0592fcd
Showing 5 changed files with 75 additions and 81 deletions.
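
In short: rather than chunking only a sample of the full ~12 MB metadata download (the old -s 10 flag on chunk_data.py), the download itself is now truncated to the first few dataset records, so every downstream artefact shrinks (eidc_metadata.json drops from 12157676 to 125674 bytes in dvc.lock below). To try the new flag directly (a usage sketch, assuming the repository's Python environment is installed), this is the exact command DVC now records:

    # Download EIDC metadata, keeping only the first 3 dataset records
    python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3
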
110 changes: 55 additions & 55 deletions dvc.lock
@@ -1,17 +1,17 @@
 schema: '2.0'
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
+    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: 53d620665448ef91f2deedb517e2f502
-      size: 675
+      md5: a564cb0804b482ef09658f0cb4a0a705
+      size: 941
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
   prepare:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
@@ -33,94 +33,94 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
     - path: scripts/extract_metadata.py
       hash: md5
-      md5: 3f0269a6413845f4425af55e7cea7bf8
-      size: 1304
+      md5: e66f21369c5106eaaad4476612c6fb5e
+      size: 1313
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 789fda7a14f9a85c6ee0e10af8170a95
-      size: 4584498
+      md5: e71f887d993834e3bda1eb00e711e724
+      size: 7005
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s
-      10 data/extracted_metadata.json data/supporting-docs.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+      data/supporting-docs.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 789fda7a14f9a85c6ee0e10af8170a95
-      size: 4584498
+      md5: e71f887d993834e3bda1eb00e711e724
+      size: 7005
     - path: data/supporting-docs.json
       hash: md5
-      md5: f3ea9980226e5408497c96a10cc77b80
-      size: 72013526
+      md5: bdab1ea8df4a87aa3d314044eb2eaa0a
+      size: 188762
     - path: scripts/chunk_data.py
       hash: md5
-      md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
-      size: 2509
+      md5: e8de02d6b14c8fc22533d0becfb7d35d
+      size: 2198
     outs:
     - path: data/chunked_data.json
       hash: md5
-      md5: f6426396e1a3564b53649ef5fc0571fd
-      size: 993814
+      md5: a01ff8ed4d429203d6903466d26937ff
+      size: 320740
   create-embeddings:
     cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
     deps:
     - path: data/chunked_data.json
       hash: md5
-      md5: f6426396e1a3564b53649ef5fc0571fd
-      size: 993814
+      md5: a01ff8ed4d429203d6903466d26937ff
+      size: 320740
     - path: scripts/create_embeddings.py
       hash: md5
-      md5: 4649c700dfae922b43b3608ee4f00c1a
-      size: 808
+      md5: d9282fc92ed400855c4fc2a290289f14
+      size: 867
     outs:
     - path: data/embeddings.json
       hash: md5
-      md5: 8fd682131a282736f6a81a6c53040b1e
-      size: 13422675
+      md5: 363e3eaf7f8baddf9aa2e83f45f074b1
+      size: 4345553
   upload-to-docstore:
     cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
       -em all-MiniLM-L6-v2 -c eidc-data
     deps:
     - path: data/embeddings.json
       hash: md5
-      md5: 8fd682131a282736f6a81a6c53040b1e
-      size: 13422675
+      md5: 363e3eaf7f8baddf9aa2e83f45f074b1
+      size: 4345553
     - path: scripts/upload_to_docstore.py
       hash: md5
-      md5: 41da88e3bb6d2592bee938ce347f6983
-      size: 1905
+      md5: 7b9433047ff175d5e6af8d6056caf05b
+      size: 1931
     outs:
     - path: data/chroma-data
       hash: md5
-      md5: 5c99644f30def03f87b37c98341c6f25.dir
-      size: 13758136
-      nfiles: 6
+      md5: 39b81f6d319a02523fbc356dd667b920.dir
+      size: 5702372
+      nfiles: 5
   run-rag-pipeline:
     cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
       data/chroma-data -c eidc-data
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 5c99644f30def03f87b37c98341c6f25.dir
-      size: 13758136
-      nfiles: 6
+      md5: 39b81f6d319a02523fbc356dd667b920.dir
+      size: 5702372
+      nfiles: 5
     - path: data/eidc_rag_test_sample.csv
       hash: md5
       md5: a371d83c5822d256286e80d64d58c3fe
       size: 7524
     - path: scripts/run_rag_pipeline.py
       hash: md5
-      md5: 8d5fc0669771146562c773186f4f44f6
-      size: 3667
+      md5: ea2b8d94ee42499870d925f916982e8a
+      size: 3781
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 8ea0a3f240478e9db41855922ac534a6
-      size: 9894
+      md5: a4470a84d2de8b1d04c7d2dfd8b5f807
+      size: 9859
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -133,35 +133,35 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
     - path: scripts/fetch_supporting_docs.py
       hash: md5
-      md5: 923af3b6ce1447d388b08fab0e3ab77d
-      size: 1660
+      md5: 02b94a2cc7bff711784cbdec3650b618
+      size: 1718
     outs:
     - path: data/supporting-docs.json
       hash: md5
-      md5: f3ea9980226e5408497c96a10cc77b80
-      size: 72013526
+      md5: bdab1ea8df4a87aa3d314044eb2eaa0a
+      size: 188762
   evaluate:
     cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
       -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 8ea0a3f240478e9db41855922ac534a6
-      size: 9894
+      md5: a4470a84d2de8b1d04c7d2dfd8b5f807
+      size: 9859
     - path: scripts/evaluate.py
       hash: md5
-      md5: 10f76511eafc8a1a9b90e9ae92a76bc5
-      size: 2633
+      md5: a9c4c04157007c12c068aacdf5e099a9
+      size: 2634
     outs:
     - path: data/eval.png
       hash: md5
-      md5: bae77b1b721bf283a30a64f67af45fea
-      size: 74438
+      md5: 981434fb5f4e61ce4288a4431f70bcc1
+      size: 67852
     - path: data/metrics.json
       hash: md5
-      md5: 0145280f36071a6df551ef57d3f8393e
-      size: 229
+      md5: 20efb8ebf0d6908f0ee7b35dbff2e7c7
+      size: 242
4 changes: 2 additions & 2 deletions dvc.yaml
@@ -1,6 +1,6 @@
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata}
+    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} -s ${sub-sample}
     deps:
     - scripts/fetch_eidc_metadata.py
     outs:
@@ -20,7 +20,7 @@ stages:
     outs:
     - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} ${files.supporting-docs}
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs}
     deps:
     - ${files.extracted}
     - ${files.supporting-docs}
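
For reference, with files.metadata set to data/eidc_metadata.json and sub-sample set to 3 (both from params.yaml, next file), DVC interpolates the template above into exactly the cmd recorded in dvc.lock:

    python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3
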
2 changes: 1 addition & 1 deletion params.yaml
@@ -16,7 +16,7 @@ files:
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   eval-plot: data/eval.png
-sample-size: 10 # sample size of 0 will process all data
+sub-sample: 3 # sample size of 0 will process all data
 rag:
   model: llama3.1
   prompt: >-
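
Because sub-sample is interpolated into the fetch-metadata command, editing it changes that stage's cmd in dvc.lock, so reproducing the pipeline re-runs the fetch and every downstream stage. A sketch of the usual DVC workflow (assuming no other local overrides; -S is shorthand for dvc exp run's --set-param):

    dvc repro                     # re-run stages invalidated by the new sub-sample value
    dvc exp run -S sub-sample=10  # or try another sample size without hand-editing params.yaml
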
24 changes: 4 additions & 20 deletions scripts/chunk_data.py
@@ -28,28 +28,20 @@ def chunk_metadata_value(
 
 
 def chunk_metadata_file(
-    file: str, chunk_size: int, overlap: int, sample_size: int
+    file: str, chunk_size: int, overlap: int
 ) -> List[Dict[str, str]]:
     chunked_metadata = []
     with open(file) as f:
         json_data = json.load(f)
-        count = 0
         for metadata in json_data:
             chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
-            count += 1
-            if count == sample_size:
-                break
     return chunked_metadata
 
 
-def main(
-    files: List[str], ouput_file: str, chunk_size: int, overlap: int, sample_size: int
-) -> None:
+def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
     all_chunked_metadata = []
     for file in files:
-        all_chunked_metadata.extend(
-            chunk_metadata_file(file, chunk_size, overlap, sample_size)
-        )
+        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
     with open(ouput_file, "w") as f:
         json.dump(all_chunked_metadata, f, indent=4)
 
@@ -81,14 +73,6 @@ def main(
         nargs="?",
         const=100,
     )
-    parser.add_argument(
-        "-s",
-        "--sample",
-        help="Only generate chunks for n datasets",
-        type=int,
-        nargs="?",
-        const=0,
-    )
     args = parser.parse_args()
     assert args.chunk > args.overlap
-    main(args.input_files, args.output, args.chunk, args.overlap, args.sample)
+    main(args.input_files, args.output, args.chunk, args.overlap)
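
With its own sampling gone, chunk_data.py is left with just the -c (chunk size) and -ol (overlap) options. chunk_metadata_value is outside this diff, but those flags imply a sliding window over each text value, roughly as below (a hypothetical sketch, not the repository's actual implementation):

    from typing import List

    def sliding_window(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
        # Step by (chunk_size - overlap) so consecutive chunks share `overlap`
        # characters; the script's assert (chunk > overlap) keeps the step positive.
        step = chunk_size - overlap
        return [text[i : i + chunk_size] for i in range(0, len(text), step)]
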
16 changes: 13 additions & 3 deletions scripts/fetch_eidc_metadata.py
@@ -6,7 +6,7 @@
 URL = "https://catalogue.ceh.ac.uk/eidc/documents"
 
 
-def main(output_file: str) -> None:
+def main(output_file: str, sample: int) -> None:
     res = requests.get(
         URL,
         headers={"content-type": "application/json"},
@@ -16,12 +16,22 @@ def main(output_file: str) -> None:
             "term": "recordType:Dataset",
         },
     )
+    json_data = res.json()
+    json_data["results"] = json_data["results"][:sample]
     with open(output_file, "w") as f:
-        json.dump(res.json(), f, indent=4)
+        json.dump(json_data, f, indent=4)
 
 
 if __name__ == "__main__":
     parser = ArgumentParser("fetch_eidc_metadata.py")
     parser.add_argument("output", help="The file path to save the downloaded data to.")
+    parser.add_argument(
+        "-s",
+        "--sample",
+        help="Only save n datasets",
+        type=int,
+        nargs="?",
+        const=0,
+    )
     args = parser.parse_args()
-    main(args.output)
+    main(args.output, args.sample)
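
One behavioural note on the new flag, down to plain Python slicing: omitting -s leaves args.sample as None, and results[:None] keeps every record, while a bare -s falls back to const=0 and results[:0] keeps none. The comment carried over in params.yaml ("sample size of 0 will process all data") described the old chunk_data.py loop rather than this slice:

    results = ["a", "b", "c"]
    results[:None]  # ['a', 'b', 'c']  (-s omitted: full download)
    results[:3]     # ['a', 'b', 'c']  (-s 3: first three records)
    results[:0]     # []               (bare -s, const=0: nothing is kept)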
