Merge pull request #12 from NERC-CEH/sub-sampling
Added option to sub-sample initial metadata download
matthewcoole authored Nov 15, 2024
2 parents cb9522b + e14b95c commit 0592fcd
Showing 5 changed files with 75 additions and 81 deletions.
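
In short: rather than chunking only a sample of the full ~12 MB metadata download (the old -s 10 flag on chunk_data.py), the download itself is now truncated to the first few dataset records, so every downstream artefact shrinks (eidc_metadata.json drops from 12157676 to 125674 bytes in dvc.lock below). To try the new flag directly (a usage sketch, assuming the repository's Python environment is installed), this is the exact command DVC now records:

    # Download EIDC metadata, keeping only the first 3 dataset records
    python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3
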
110 changes: 55 additions & 55 deletions dvc.lock
@@ -1,17 +1,17 @@
 schema: '2.0'
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json
+    cmd: python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3
     deps:
     - path: scripts/fetch_eidc_metadata.py
       hash: md5
-      md5: 53d620665448ef91f2deedb517e2f502
-      size: 675
+      md5: a564cb0804b482ef09658f0cb4a0a705
+      size: 941
     outs:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
   prepare:
     cmd: python scripts/extract_metadata.py data/eidc_metadata.json data/extracted_metadata.json
     deps:
@@ -33,94 +33,94 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
     - path: scripts/extract_metadata.py
       hash: md5
-      md5: 3f0269a6413845f4425af55e7cea7bf8
-      size: 1304
+      md5: e66f21369c5106eaaad4476612c6fb5e
+      size: 1313
     outs:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 789fda7a14f9a85c6ee0e10af8170a95
-      size: 4584498
+      md5: e71f887d993834e3bda1eb00e711e724
+      size: 7005
   chunk-data:
-    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 -s
-      10 data/extracted_metadata.json data/supporting-docs.json
+    cmd: python scripts/chunk_data.py -o data/chunked_data.json -c 500 -ol 100 data/extracted_metadata.json
+      data/supporting-docs.json
     deps:
     - path: data/extracted_metadata.json
       hash: md5
-      md5: 789fda7a14f9a85c6ee0e10af8170a95
-      size: 4584498
+      md5: e71f887d993834e3bda1eb00e711e724
+      size: 7005
     - path: data/supporting-docs.json
       hash: md5
-      md5: f3ea9980226e5408497c96a10cc77b80
-      size: 72013526
+      md5: bdab1ea8df4a87aa3d314044eb2eaa0a
+      size: 188762
     - path: scripts/chunk_data.py
       hash: md5
-      md5: 681528e4aa1dc8cfb5fe5e5472e25fdf
-      size: 2509
+      md5: e8de02d6b14c8fc22533d0becfb7d35d
+      size: 2198
     outs:
     - path: data/chunked_data.json
       hash: md5
-      md5: f6426396e1a3564b53649ef5fc0571fd
-      size: 993814
+      md5: a01ff8ed4d429203d6903466d26937ff
+      size: 320740
   create-embeddings:
     cmd: python scripts/create_embeddings.py data/chunked_data.json data/embeddings.json
     deps:
     - path: data/chunked_data.json
       hash: md5
-      md5: f6426396e1a3564b53649ef5fc0571fd
-      size: 993814
+      md5: a01ff8ed4d429203d6903466d26937ff
+      size: 320740
     - path: scripts/create_embeddings.py
       hash: md5
-      md5: 4649c700dfae922b43b3608ee4f00c1a
-      size: 808
+      md5: d9282fc92ed400855c4fc2a290289f14
+      size: 867
     outs:
     - path: data/embeddings.json
       hash: md5
-      md5: 8fd682131a282736f6a81a6c53040b1e
-      size: 13422675
+      md5: 363e3eaf7f8baddf9aa2e83f45f074b1
+      size: 4345553
   upload-to-docstore:
     cmd: python scripts/upload_to_docstore.py data/embeddings.json -o data/chroma-data
       -em all-MiniLM-L6-v2 -c eidc-data
     deps:
     - path: data/embeddings.json
       hash: md5
-      md5: 8fd682131a282736f6a81a6c53040b1e
-      size: 13422675
+      md5: 363e3eaf7f8baddf9aa2e83f45f074b1
+      size: 4345553
     - path: scripts/upload_to_docstore.py
       hash: md5
-      md5: 41da88e3bb6d2592bee938ce347f6983
-      size: 1905
+      md5: 7b9433047ff175d5e6af8d6056caf05b
+      size: 1931
     outs:
     - path: data/chroma-data
       hash: md5
-      md5: 5c99644f30def03f87b37c98341c6f25.dir
-      size: 13758136
-      nfiles: 6
+      md5: 39b81f6d319a02523fbc356dd667b920.dir
+      size: 5702372
+      nfiles: 5
   run-rag-pipeline:
     cmd: python scripts/run_rag_pipeline.py data/eidc_rag_test_sample.csv data/evaluation_data.csv
       data/chroma-data -c eidc-data
     deps:
     - path: data/chroma-data
       hash: md5
-      md5: 5c99644f30def03f87b37c98341c6f25.dir
-      size: 13758136
-      nfiles: 6
+      md5: 39b81f6d319a02523fbc356dd667b920.dir
+      size: 5702372
+      nfiles: 5
     - path: data/eidc_rag_test_sample.csv
       hash: md5
       md5: a371d83c5822d256286e80d64d58c3fe
       size: 7524
     - path: scripts/run_rag_pipeline.py
       hash: md5
-      md5: 8d5fc0669771146562c773186f4f44f6
-      size: 3667
+      md5: ea2b8d94ee42499870d925f916982e8a
+      size: 3781
     outs:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 8ea0a3f240478e9db41855922ac534a6
-      size: 9894
+      md5: a4470a84d2de8b1d04c7d2dfd8b5f807
+      size: 9859
   generate-testset:
     cmd: cp data/synthetic-datasets/eidc_rag_test_sample.csv data/
     outs:
@@ -133,35 +133,35 @@ stages:
     deps:
     - path: data/eidc_metadata.json
       hash: md5
-      md5: b4f3774a2921debb4d7740165ac604d4
-      size: 12157676
+      md5: 068ae066ea08ee369c505c8640481cf6
+      size: 125674
     - path: scripts/fetch_supporting_docs.py
       hash: md5
-      md5: 923af3b6ce1447d388b08fab0e3ab77d
-      size: 1660
+      md5: 02b94a2cc7bff711784cbdec3650b618
+      size: 1718
     outs:
     - path: data/supporting-docs.json
       hash: md5
-      md5: f3ea9980226e5408497c96a10cc77b80
-      size: 72013526
+      md5: bdab1ea8df4a87aa3d314044eb2eaa0a
+      size: 188762
   evaluate:
     cmd: python scripts/evaluate.py data/evaluation_data.csv -m data/metrics.json
       -img data/eval.png
     deps:
     - path: data/evaluation_data.csv
       hash: md5
-      md5: 8ea0a3f240478e9db41855922ac534a6
-      size: 9894
+      md5: a4470a84d2de8b1d04c7d2dfd8b5f807
+      size: 9859
     - path: scripts/evaluate.py
       hash: md5
-      md5: 10f76511eafc8a1a9b90e9ae92a76bc5
-      size: 2633
+      md5: a9c4c04157007c12c068aacdf5e099a9
+      size: 2634
     outs:
     - path: data/eval.png
       hash: md5
-      md5: bae77b1b721bf283a30a64f67af45fea
-      size: 74438
+      md5: 981434fb5f4e61ce4288a4431f70bcc1
+      size: 67852
     - path: data/metrics.json
       hash: md5
-      md5: 0145280f36071a6df551ef57d3f8393e
-      size: 229
+      md5: 20efb8ebf0d6908f0ee7b35dbff2e7c7
+      size: 242
4 changes: 2 additions & 2 deletions dvc.yaml
@@ -1,6 +1,6 @@
 stages:
   fetch-metadata:
-    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata}
+    cmd: python scripts/fetch_eidc_metadata.py ${files.metadata} -s ${sub-sample}
     deps:
     - scripts/fetch_eidc_metadata.py
     outs:
@@ -20,7 +20,7 @@ stages:
     outs:
     - ${files.extracted}
   chunk-data:
-    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} -s ${sample-size} ${files.extracted} ${files.supporting-docs}
+    cmd: python scripts/chunk_data.py -o ${files.chunked} -c ${hp.chunk-size} -ol ${hp.overlap} ${files.extracted} ${files.supporting-docs}
     deps:
     - ${files.extracted}
     - ${files.supporting-docs}
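
For reference, with files.metadata set to data/eidc_metadata.json and sub-sample set to 3 (both from params.yaml, next file), DVC interpolates the template above into exactly the cmd recorded in dvc.lock:

    python scripts/fetch_eidc_metadata.py data/eidc_metadata.json -s 3
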
2 changes: 1 addition & 1 deletion params.yaml
@@ -16,7 +16,7 @@ files:
   eval-set: data/evaluation_data.csv
   metrics: data/metrics.json
   eval-plot: data/eval.png
-sample-size: 10 # sample size of 0 will process all data
+sub-sample: 3 # sample size of 0 will process all data
 rag:
   model: llama3.1
   prompt: >-
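
Because sub-sample is interpolated into the fetch-metadata command, editing it changes that stage's cmd in dvc.lock, so reproducing the pipeline re-runs the fetch and every downstream stage. A sketch of the usual DVC workflow (assuming no other local overrides; -S is shorthand for dvc exp run's --set-param):

    dvc repro                     # re-run stages invalidated by the new sub-sample value
    dvc exp run -S sub-sample=10  # or try another sample size without hand-editing params.yaml
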
24 changes: 4 additions & 20 deletions scripts/chunk_data.py
@@ -28,28 +28,20 @@ def chunk_metadata_value(
 
 
 def chunk_metadata_file(
-    file: str, chunk_size: int, overlap: int, sample_size: int
+    file: str, chunk_size: int, overlap: int
 ) -> List[Dict[str, str]]:
     chunked_metadata = []
     with open(file) as f:
         json_data = json.load(f)
-        count = 0
         for metadata in json_data:
             chunked_metadata.extend(chunk_metadata_value(metadata, chunk_size, overlap))
-            count += 1
-            if count == sample_size:
-                break
     return chunked_metadata
 
 
-def main(
-    files: List[str], ouput_file: str, chunk_size: int, overlap: int, sample_size: int
-) -> None:
+def main(files: List[str], ouput_file: str, chunk_size: int, overlap: int) -> None:
     all_chunked_metadata = []
     for file in files:
-        all_chunked_metadata.extend(
-            chunk_metadata_file(file, chunk_size, overlap, sample_size)
-        )
+        all_chunked_metadata.extend(chunk_metadata_file(file, chunk_size, overlap))
     with open(ouput_file, "w") as f:
         json.dump(all_chunked_metadata, f, indent=4)
 
@@ -81,14 +73,6 @@ def main(
         nargs="?",
         const=100,
     )
-    parser.add_argument(
-        "-s",
-        "--sample",
-        help="Only generate chunks for n datasets",
-        type=int,
-        nargs="?",
-        const=0,
-    )
     args = parser.parse_args()
     assert args.chunk > args.overlap
-    main(args.input_files, args.output, args.chunk, args.overlap, args.sample)
+    main(args.input_files, args.output, args.chunk, args.overlap)
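
With its own sampling gone, chunk_data.py is left with just the -c (chunk size) and -ol (overlap) options. chunk_metadata_value is outside this diff, but those flags imply a sliding window over each text value, roughly as below (a hypothetical sketch, not the repository's actual implementation):

    from typing import List

    def sliding_window(text: str, chunk_size: int = 500, overlap: int = 100) -> List[str]:
        # Step by (chunk_size - overlap) so consecutive chunks share `overlap`
        # characters; the script's assert (chunk > overlap) keeps the step positive.
        step = chunk_size - overlap
        return [text[i : i + chunk_size] for i in range(0, len(text), step)]
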
16 changes: 13 additions & 3 deletions scripts/fetch_eidc_metadata.py
@@ -6,7 +6,7 @@
 URL = "https://catalogue.ceh.ac.uk/eidc/documents"
 
 
-def main(output_file: str) -> None:
+def main(output_file: str, sample: int) -> None:
     res = requests.get(
         URL,
         headers={"content-type": "application/json"},
@@ -16,12 +16,22 @@ def main(output_file: str) -> None:
             "term": "recordType:Dataset",
         },
     )
+    json_data = res.json()
+    json_data["results"] = json_data["results"][:sample]
     with open(output_file, "w") as f:
-        json.dump(res.json(), f, indent=4)
+        json.dump(json_data, f, indent=4)
 
 
 if __name__ == "__main__":
     parser = ArgumentParser("fetch_eidc_metadata.py")
     parser.add_argument("output", help="The file path to save the downloaded data to.")
+    parser.add_argument(
+        "-s",
+        "--sample",
+        help="Only save n datasets",
+        type=int,
+        nargs="?",
+        const=0,
+    )
     args = parser.parse_args()
-    main(args.output)
+    main(args.output, args.sample)
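
One behavioural note on the new flag, down to plain Python slicing: omitting -s leaves args.sample as None, and results[:None] keeps every record, while a bare -s falls back to const=0 and results[:0] keeps none. The comment carried over in params.yaml ("sample size of 0 will process all data") described the old chunk_data.py loop rather than this slice:

    results = ["a", "b", "c"]
    results[:None]  # ['a', 'b', 'c']  (-s omitted: full download)
    results[:3]     # ['a', 'b', 'c']  (-s 3: first three records)
    results[:0]     # []               (bare -s, const=0: nothing is kept)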
