From e14b95c95e8099b5267c54e312bda0cf073c9518 Mon Sep 17 00:00:00 2001
From: mpc
Date: Thu, 14 Nov 2024 15:44:20 +0000
Subject: [PATCH] Added option to sub-sample initial metadata download

---
 dvc.lock                       | 110 ++++++++++++++++-----------------
 dvc.yaml                       |   4 +-
 params.yaml                    |   2 +-
 scripts/chunk_data.py          |  24 ++-----
 scripts/fetch_eidc_metadata.py |  16 ++++-
 5 files changed, 75 insertions(+), 81 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index 1d52d2e..f520496 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -1,17 +1,17 @@
 schema: '2.0'
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
+    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: 53d620665448ef91f2deedb517e2f502
-      size: 675
+      md5: a564cb0804b482ef09658f0cb4a0a705
+      size: 941
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
   prepare:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
@@ -33,94 +33,94 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
     - path: scripts/extract_metadata.py
       hash: md5
-      md5: 3f0269a6413845f4425af55e7cea7bf8
-      size: 1304
+      md5: e66f21369c5106eaaad4476612c6fb5e
+      size: 1313
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 789fda7a14f9a85c6ee0e10af8170a95
-      size: 4584498
+      md5: e71f887d993834e3bda1eb00e711e724
+      size: 7005
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s
-      10 data/extracted_metadata.json data/supporting-docs.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+      data/supporting-docs.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 789fda7a14f9a85c6ee0e10af8170a95
-      size: 4584498
+      md5: e71f887d993834e3bda1eb00e711e724
+      size: 7005
     - path: data/supporting-docs.json
       hash: md5
-      md5: f3ea9980226e5408497c96a10cc77b80
-      size: 72013526
+      md5: bdab1ea8df4a87aa3d314044eb2eaa0a
+      size: 188762
     - path: scripts/chunk_data.py
       hash: md5
-      md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
-      size: 2509
+      md5: e8de02d6b14c8fc22533d0becfb7d35d
+      size: 2198
     outs:
     - path: data/chunked_data.json
       hash: md5
-      md5: f6426396e1a3564b53649ef5fc0571fd
-      size: 993814
+      md5: a01ff8ed4d429203d6903466d26937ff
+      size: 320740
   create-embeddings:
     cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
     deps:
     - path: data/chunked_data.json
       hash: md5
-      md5: f6426396e1a3564b53649ef5fc0571fd
-      size: 993814
+      md5: a01ff8ed4d429203d6903466d26937ff
+      size: 320740
     - path: scripts/create_embeddings.py
       hash: md5
-      md5: 4649c700dfae922b43b3608ee4f00c1a
-      size: 808
+      md5: d9282fc92ed400855c4fc2a290289f14
+      size: 867
     outs:
     - path: data/embeddings.json
       hash: md5
-      md5: 8fd682131a282736f6a81a6c53040b1e
-      size: 13422675
+      md5: 363e3eaf7f8baddf9aa2e83f45f074b1
+      size: 4345553
   upload-to-docstore:
     cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
       -em all-MiniLM-L6-v2 -c eidc-data
     deps:
     - path: data/embeddings.json
       hash: md5
-      md5: 8fd682131a282736f6a81a6c53040b1e
-      size: 13422675
+      md5: 363e3eaf7f8baddf9aa2e83f45f074b1
+      size: 4345553
     - path: scripts/upload_to_docstore.py
       hash: md5
-      md5: 41da88e3bb6d2592bee938ce347f6983
-      size: 1905
+      md5: 7b9433047ff175d5e6af8d6056caf05b
+      size: 1931
     outs:
     - path: data/chroma-data
       hash: md5
-      md5: 5c99644f30def03f87b37c98341c6f25.dir
-      size: 13758136
-      nfiles: 6
+      md5: 39b81f6d319a02523fbc356dd667b920.dir
+      size: 5702372
+      nfiles: 5
   run-rag-pipeline:
     cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
       data/chroma-data -c eidc-data
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 5c99644f30def03f87b37c98341c6f25.dir
-      size: 13758136
-      nfiles: 6
+      md5: 39b81f6d319a02523fbc356dd667b920.dir
+      size: 5702372
+      nfiles: 5
     - path: data/eidc_rag_test_sample.csv
       hash: md5
       md5: a371d83c5822d256286e80d64d58c3fe
       size: 7524
     - path: scripts/run_rag_pipeline.py
       hash: md5
-      md5: 8d5fc0669771146562c773186f4f44f6
-      size: 3667
+      md5: ea2b8d94ee42499870d925f916982e8a
+      size: 3781
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 8ea0a3f240478e9db41855922ac534a6
-      size: 9894
+      md5: a4470a84d2de8b1d04c7d2dfd8b5f807
+      size: 9859
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -133,35 +133,35 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
     - path: scripts/fetch_supporting_docs.py
       hash: md5
-      md5: 923af3b6ce1447d388b08fab0e3ab77d
-      size: 1660
+      md5: 02b94a2cc7bff711784cbdec3650b618
+      size: 1718
     outs:
     - path: data/supporting-docs.json
       hash: md5
-      md5: f3ea9980226e5408497c96a10cc77b80
-      size: 72013526
+      md5: bdab1ea8df4a87aa3d314044eb2eaa0a
+      size: 188762
   evaluate:
     cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
       -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 8ea0a3f240478e9db41855922ac534a6
-      size: 9894
+      md5: a4470a84d2de8b1d04c7d2dfd8b5f807
+      size: 9859
     - path: scripts/evaluate.py
       hash: md5
-      md5: 10f76511eafc8a1a9b90e9ae92a76bc5
-      size: 2633
+      md5: a9c4c04157007c12c068aacdf5e099a9
+      size: 2634
     outs:
     - path: data/eval.png
       hash: md5
-      md5: bae77b1b721bf283a30a64f67af45fea
-      size: 74438
+      md5: 981434fb5f4e61ce4288a4431f70bcc1
+      size: 67852
    - path: data/metrics.json
      hash: md5
-      md5: 0145280f36071a6df551ef57d3f8393e
-      size: 229
+      md5: 20efb8ebf0d6908f0ee7b35dbff2e7c7
+      size: 242
diff --git a/dvc.yaml b/dvc.yaml
index 0e9f154..2473eec 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,6 +1,6 @@
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata}
+    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} -s ${sub-sample}
     deps:
     - scripts/fetch_eidc_metadata.py
     outs:
@@ -20,7 +20,7 @@
     outs:
     - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} ${files.supporting-docs}
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs}
     deps:
     - ${files.extracted}
     - ${files.supporting-docs}
diff --git a/params.yaml b/params.yaml
index 85c3119..edf0085 100644
--- a/params.yaml
+++ b/params.yaml
@@ -16,7 +16,7 @@ files:
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   eval-plot: data/eval.png
-sample-size: 10 # sample size of 0 will process all data
+sub-sample: 3 # sample size of 0 will process all data
 rag:
   model: llama3.1
   prompt: >-
diff --git a/scripts/chunk_data.py b/scripts/chunk_data.py
index 28707ed..7fe672b 100644
--- a/scripts/chunk_data.py
+++ b/scripts/chunk_data.py
@@ -28,28 +28,20 @@ def chunk_metadata_value(


 def chunk_metadata_file(
-    file: str, chunk_size: int, overlap: int, sample_size: int
+    file: str, chunk_size: int, overlap: int
 ) -> List[Dict[str, str]]:
     chunked_metadata = []
     with open(file) as f:
         json_data = json.load(f)
-        count = 0
         for metadata in json_data:
             chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
-            count += 1
-            if count == sample_size:
-                break
     return chunked_metadata


-def main(
-    files: List[str], ouput_file: str, chunk_size: int, overlap: int, sample_size: int
-) -> None:
+def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
     all_chunked_metadata = []
     for file in files:
-        all_chunked_metadata.extend(
-            chunk_metadata_file(file, chunk_size, overlap, sample_size)
-        )
+        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
     with open(ouput_file, "w") as f:
         json.dump(all_chunked_metadata, f, indent=4)

@@ -81,14 +73,6 @@ def main(
         nargs="?",
         const=100,
     )
-    parser.add_argument(
-        "-s",
-        "--sample",
-        help="Only generate chunks for n datasets",
-        type=int,
-        nargs="?",
-        const=0,
-    )
     args = parser.parse_args()
     assert args.chunk > args.overlap
-    main(args.input_files, args.output, args.chunk, args.overlap, args.sample)
+    main(args.input_files, args.output, args.chunk, args.overlap)
diff --git a/scripts/fetch_eidc_metadata.py b/scripts/fetch_eidc_metadata.py
index 0ab6297..5e883d9 100644
--- a/scripts/fetch_eidc_metadata.py
+++ b/scripts/fetch_eidc_metadata.py
@@ -6,7 +6,7 @@
 URL = "https://catalogue.ceh.ac.uk/eidc/documents"


-def main(output_file: str) -> None:
+def main(output_file: str, sample: int) -> None:
     res = requests.get(
         URL,
         headers={"content-type": "application/json"},
@@ -16,12 +16,22 @@ def main(output_file: str) -> None:
             "term": "recordType:Dataset",
         },
     )
+    json_data = res.json()
+    json_data["results"] = json_data["results"][:sample]
     with open(output_file, "w") as f:
-        json.dump(res.json(), f, indent=4)
+        json.dump(json_data, f, indent=4)


 if __name__ == "__main__":
     parser = ArgumentParser("fetch_eidc_metadata.py")
     parser.add_argument("output", help="The file path to save the downloaded data to.")
+    parser.add_argument(
+        "-s",
+        "--sample",
+        help="Only save n datasets",
+        type=int,
+        nargs="?",
+        const=0,
+    )
     args = parser.parse_args()
-    main(args.output)
+    main(args.output, args.sample)
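
A minimal sketch of the slicing behaviour behind the new -s/--sample flag (illustrative only, not part of the patch; "records" is a hypothetical stand-in for the "results" list returned by the catalogue endpoint):

    # Truncation as used in fetch_eidc_metadata.py: json_data["results"][:sample]
    records = ["dataset-1", "dataset-2", "dataset-3", "dataset-4"]

    # "-s 3" on the command line: args.sample == 3, so only the first three
    # records are kept (the case recorded in dvc.lock above).
    print(records[:3])     # ['dataset-1', 'dataset-2', 'dataset-3']

    # Flag omitted: args.sample is None, and a [:None] slice keeps every record.
    print(records[:None])  # ['dataset-1', 'dataset-2', 'dataset-3', 'dataset-4']

    # Bare "-s" with no value: argparse substitutes const=0, and [:0] keeps nothing.
    print(records[:0])     # []

With sub-sample: 3 set in params.yaml, dvc repro runs the fetch-metadata stage as "python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3", the command captured in dvc.lock above.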